MAPC · anaximander · Dec 14, 2023
diff --git a/.env.template b/.env.template
@@ -1,8 +1,13 @@
-OUT_FILE_PATH='.'
-OUT_FILE_NAME='test,csv'
-DB_USER='rental-listing-aggregator'
-DB_PASSWORD='password'
-DB_HOST='127.0.0.1'
+MAPPER_YEAR=2020
+MAPPER_MONTH=1
+#MAPPER_QUARTER=1
+
+OUT_FILE_PATH=./output/
+OUT_FILE_NAME=mapped.csv
+OUT_TABLE_NAME=mapped
+
+DB_USER=rental-listing-aggregator
+DB_PASSWORD=password
+DB_HOST=127.0.0.1
 DB_PORT=5432
-DB_NAME='rental-listing-aggregator'
-MAPPER_MONTH='6'
+DB_NAME=rental-listing-aggregator
diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,5 @@
 .DS_Store
 .env
+
+virtualenv_*
+output/**
diff --git a/.python-version b/.python-version
@@ -0,0 +1 @@
+3.8
diff --git a/Pipfile b/Pipfile
@@ -6,14 +6,16 @@ verify_ssl = true
 [dev-packages]
 
 [packages]
-numpy = "==1.19.0"
-pandas = "==1.0.5"
-psycopg2 = "==2.8.5"
-python-dateutil = "==2.8.1"
-python-dotenv = "==0.13.0"
-pytz = "==2020.1"
-six = "==1.15.0"
-SQLAlchemy = "==1.3.18"
+numpy = "==1.24.4"
+pandas = "==2.0.3"
+psycopg2 = "==2.9.9"
+python-dateutil = "==2.8.2"
+python-dotenv = "==1.0.0"
+pytz = "==2023.3.post1"
+six = "==1.16.0"
+tzdata = "==2023.3"
+sqlalchemy = "==2.0.23"
+typing-extensions = "==4.9.0"
 
 [requires]
 python_version = "3.8"
diff --git a/Pipfile.lock b/Pipfile.lock
diff --git a/README.md b/README.md
@@ -1,5 +1,20 @@
 # Rental Listing Mapper
 
-This repository pulls the table that is populated by the 
-[scraper](https://github.com/mapc/rental-listing-scraper) module
-and maps the data into a format consumable by the [cleaner](https://github.com/mapc/rental-listing-cleaner)
+## Overview
+
+This repository pulls the table that is populated by the [scraper](https://github.com/mapc/rental-listing-scraper) module and maps the data into a format consumable by the [cleaner](https://github.com/mapc/rental-listing-cleaner).
+
+In addition to creating a CSV in this format, this process also appends the mapped data to a table in the `rental-listing-aggregator` database. The table name is set by `OUT_TABLE_NAME` (`mapped` in the template).
+
+## Running the code
+
+### Setup
+First, you'll need to set up your environment variables. These can be set using a `.env` file in the root of this project. A template (`.env.template`) has been provided as an example. The production values are saved as a secure note in Dashlane.
+
+Create a virtual environment if you haven't already: `python -m venv virtualenv_mapper`
+
+Enter the virtual environment: `source virtualenv_mapper/bin/activate`
+
+Then, install the requirements: `pip install -r requirements.txt`
+
+With all of these steps completed, and with the directory named by `OUT_FILE_PATH` created (e.g. `mkdir -p output`), you can run the mapper: `python map.py`
diff --git a/map → map.py b/map → map.py
@@ -1,26 +1,33 @@
 #!/usr/bin/env python3
 
 import json
-import sqlalchemy
-import pandas as pd
 from sys import exit
 from os import environ, path
 from datetime import datetime
 from datetime import date
 from dateutil.relativedelta import *
 
+import sqlalchemy
+import pandas as pd
+from dotenv import load_dotenv
+
+
+load_dotenv()
+
 def longmap(record):
     """Return the record's longitude as a float, reading 'lng' or else 'Longitude'."""
     # Two key spellings are handled; 'lng' wins when both are present.
     lon_key = 'lng' if 'lng' in record else 'Longitude'
     return float(record[lon_key])
 
+
 def latmap(record):
     """Return the record's latitude as a float: 'lat' if present, otherwise 'Latitude'."""
     if 'lat' not in record:
         return float(record['Latitude'])
     return float(record['lat'])
 
+
 if 'MAPPER_YEAR' in environ:
     YEAR = int(environ['MAPPER_YEAR'])
 else:
@@ -50,8 +57,11 @@ def latmap(record):
         next_month=datetime.now().strftime('%m')
     )
 
+print("Reading raw listings from DB...")
 engine = sqlalchemy.create_engine('postgresql://{}:{}@{}:{}/{}'.format(environ['DB_USER'], environ['DB_PASSWORD'], environ['DB_HOST'], environ['DB_PORT'], environ['DB_NAME']))
 df = pd.read_sql_query(sqlalchemy.text('SELECT * FROM listings WHERE \'{range}\'::tsrange @> last_seen'.format(range=RANGE)), engine)
+
+print("Mapping data to output format...")
 df.rename(columns={'posting_date': 'post_at'}, inplace=True)
 
 df = df.dropna(subset=['payload'])
@@ -61,4 +71,9 @@ def latmap(record):
 
 mapped = df[['uid', 'ask', 'bedrooms', 'title', 'address', 'post_at', 'created_at', 'updated_at', 'source_id', 'survey_id', 'latitude', 'longitude']]
 
+print(f"Writing to {path.join(environ['OUT_FILE_PATH'], environ['OUT_FILE_NAME'])}...")  # log the same joined path that to_csv() writes to, even when OUT_FILE_PATH has no trailing separator
 mapped.to_csv(path.join(environ['OUT_FILE_PATH'], environ['OUT_FILE_NAME']), index=False, header=False)
+print(f"Writing to {environ['OUT_TABLE_NAME']} table...")
+mapped.to_sql(environ['OUT_TABLE_NAME'], engine, if_exists='append', index=False, chunksize=1000)
+
+print("Done.")
diff --git a/requirements.txt b/requirements.txt
@@ -1,8 +1,10 @@
-numpy==1.19.0
-pandas==1.0.5
-psycopg2==2.8.5
-python-dateutil==2.8.1
-python-dotenv==0.13.0
-pytz==2020.1
-six==1.15.0
-SQLAlchemy==1.3.18
+numpy==1.24.4
+pandas==2.0.3
+psycopg2==2.9.9
+python-dateutil==2.8.2
+python-dotenv==1.0.0
+pytz==2023.3.post1
+six==1.16.0
+SQLAlchemy==2.0.23
+typing_extensions==4.9.0
+tzdata==2023.3