Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

assessment example denormalizations #22

Open
wants to merge 10 commits into
base: master
Choose a base branch
from
25 changes: 13 additions & 12 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -151,16 +151,8 @@ your export. To create a docker container with an already downloaded export

After creation use the ``list`` command to check the status of the
container and view the container name, database name, address and port to
connect to the database. Using docker, you can connect to the database by
running::

docker run -it --rm --link $CONTAINER_NAME postgres:9.5 psql -h $CONTAINER_NAME -d $DATABASE_NAME -U postgres

If you have ``psql`` installed, you can connect with::

psql -p $HOST_PORT -h $HOST_IP -d $DATABASE_NAME -U postgres

with the parameters provided by ``list``.
connect to the database. Use the ``db connect $CONTAINER_NAME`` command to open
a psql shell.

list
~~~~
Expand Down Expand Up @@ -189,17 +181,26 @@ Remove a container::
db
^^

connect
~~~~~~~
Open a shell to a postgres database::

courseraresearchexports db connect $CONTAINER_NAME

create_view
~~~~~~~~~~~
Create a view in the postgres database. We are planning to include commonly
used denormalized views as part of this project. To create one of these views
(i.e. for the demographic_survey view)::

courseraresearchexports db create_view $CONTAINER_NAME --view_name demographic_survey --partner_short_name $PARTNER_SHORT_NAME
courseraresearchexports db create_view $CONTAINER_NAME --view_name demographic_survey

If you have your own sql script that you'd like to create as a view::

courseraresearchexports db create_view $CONTAINER_NAME --sql_file /path/to/sql/file/ --partner_short_name $PARTNER_SHORT_NAME
courseraresearchexports db create_view $CONTAINER_NAME --sql_file /path/to/sql/file/

Note: because ``user_id`` column names vary by partner and by user id hashing,
please refer to the exports guide for SQL formatting guidelines.

unload_to_csv
~~~~~~~~~~~~~
Expand Down
5 changes: 3 additions & 2 deletions courseraresearchexports/commands/containers.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import logging

from tabulate import tabulate
Expand Down Expand Up @@ -66,8 +68,7 @@ def list_containers(args):
container_info.host_port
])

logging.info('\n' + tabulate(containers_info_table,
headers='firstrow'))
print(tabulate(containers_info_table, headers='firstrow'))


def start_container(args):
Expand Down
33 changes: 23 additions & 10 deletions courseraresearchexports/commands/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,20 +12,31 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import logging

from tabulate import tabulate

import courseraresearchexports.db.db as db
from courseraresearchexports.containers import utils


def connect(args):
    """
    Connect postgres shell to dockerized database.
    """
    # NOTE: the docstring above is reused verbatim as the ``db connect`` help
    # text by the argument parser, so keep it short and user-facing.
    docker = utils.docker_client(args.docker_url, args.timeout)
    db.connect(args.container_name, docker_client=docker)


def list_tables(args):
    """
    List all of the tables present in a dockerized database.
    """
    d = utils.docker_client(args.docker_url, args.timeout)
    tables = db.get_table_names(args.container_name, docker_client=d)
    # Emit the listing exactly once, directly on stdout rather than through
    # the logger; each table name becomes its own single-cell row.
    print(tabulate([[table] for table in tables]))


def list_views(args):
Expand All @@ -34,7 +45,7 @@ def list_views(args):
"""
d = utils.docker_client(args.docker_url, args.timeout)
tables = db.get_view_names(args.container_name, docker_client=d)
logging.info('\n' + tabulate([[table] for table in tables]))
print(tabulate([[table] for table in tables]))


def create_view(args):
Expand All @@ -45,12 +56,10 @@ def create_view(args):

if args.view_name:
created_view = db.create_registered_view(
args.container_name, args.view_name,
args.partner_short_name, d)
args.container_name, args.view_name, d)
elif args.sql_file:
created_view = db.create_view_from_file(
args.container_name, args.sql_file,
args.partner_short_name, d)
args.container_name, args.sql_file, d)

logging.info('Created view {}'.format(created_view))

Expand Down Expand Up @@ -108,10 +117,6 @@ def parser(subparsers):
create_source_subparser.add_argument(
'--sql_file',
help='SQL file with query.')
parser_create_view.add_argument(
'--partner_short_name',
help='Your partner short name.',
required=True)

parser_unload = db_subparsers.add_parser(
'unload_to_csv',
Expand All @@ -127,4 +132,12 @@ def parser(subparsers):
'--relation',
help='Table or view to export.')

parser_connect = db_subparsers.add_parser(
'connect',
help=connect.__doc__)
parser_connect.set_defaults(func=connect)
parser_connect.add_argument(
'container_name',
help='Name of the container database.')

return parser_db
13 changes: 8 additions & 5 deletions courseraresearchexports/commands/jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from tabulate import tabulate
from __future__ import print_function

import json
import argparse
import logging

import argparse
from tabulate import tabulate

from courseraresearchexports.exports import api
from courseraresearchexports.constants.api_constants import \
ANONYMITY_LEVEL_COORDINATOR, EXPORT_TYPE_CLICKSTREAM, \
Expand Down Expand Up @@ -105,7 +108,7 @@ def get(args):
export_request_info.append(
['Interval:', ' to '.join(export_request.interval)])

logging.info('\n' + tabulate(export_request_info, tablefmt="plain"))
print(tabulate(export_request_info, tablefmt="plain"))


def get_all(args):
Expand All @@ -126,7 +129,7 @@ def get_all(args):
export_request.scope_id,
export_request.schema_names_display])

logging.info('\n' + tabulate(export_requests_table, headers='firstrow'))
print(tabulate(export_requests_table, headers='firstrow'))


def download(args):
Expand Down Expand Up @@ -158,7 +161,7 @@ def get_clickstream_links(args):
clickstream_links_request)

# TODO: add more descriptive information or option write to text file
logging.info('\n' + tabulate(
print(tabulate(
[[link] for link in clickstream_download_links],
tablefmt="plain"))

Expand Down
4 changes: 3 additions & 1 deletion courseraresearchexports/constants/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# Submodules exposed by ``from courseraresearchexports.constants import *``.
# Every entry carries its own trailing comma: Python concatenates adjacent
# string literals, which would silently merge two module names into one.
__all__ = [
    "api_constants",
    "db_constants",
    "container_constants",
]

from . import *  # noqa
7 changes: 7 additions & 0 deletions courseraresearchexports/constants/container_constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
import os

# Docker label value; presumably attached to containers created by this tool
# so they can be found again later -- TODO confirm in containers/client.py.
COURSERA_DOCKER_LABEL = 'courseraResearchExport'
# Folder under the current user's home directory; ``~`` is expanded once, at
# import time.
COURSERA_LOCAL_FOLDER = os.path.expanduser('~/.coursera/exports/')
# Postgres image reference (repository:tag) handed to docker.
POSTGRES_DOCKER_IMAGE = 'postgres:9.5'
# Postgres log-message fragments; presumably matched against container output
# to detect start-up progress -- TODO confirm in containers/client.py.
POSTGRES_INIT_MSG = 'PostgreSQL init process complete; ready for start up.'
POSTGRES_READY_MSG = 'database system is ready to accept connections'
23 changes: 23 additions & 0 deletions courseraresearchexports/constants/db_constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Copyright 2016 Coursera
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Maps each user-id placeholder that may appear in view SQL to the export
# table whose columns are inspected to find the real user-id column name
# (the first column of that table ending in ``user_id``).
HASHED_USER_ID_COLUMN_TO_SOURCE_TABLE = {
    '[partner_user_id]': 'users',
    '[demographics_user_id]': 'demographics_answers',
    '[feedback_user_id]': 'feedback_course_ratings',
    '[assessments_user_id]': 'assessment_actions',
    '[peer_assignments_user_id]': 'peer_submissions',
    '[discussions_user_id]': 'discussion_answers',
    '[programming_assignments_user_id]': 'programming_submissions',
}
12 changes: 4 additions & 8 deletions courseraresearchexports/containers/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,18 +23,14 @@
import time

from courseraresearchexports import exports
from courseraresearchexports.models.ContainerInfo import ContainerInfo
from courseraresearchexports.constants.api_constants import \
EXPORT_TYPE_TABLES
from courseraresearchexports.constants.container_constants import \
COURSERA_DOCKER_LABEL, COURSERA_LOCAL_FOLDER, POSTGRES_DOCKER_IMAGE, \
POSTGRES_INIT_MSG, POSTGRES_READY_MSG
from courseraresearchexports.containers import utils as container_utils
from courseraresearchexports.exports import utils as export_utils


COURSERA_DOCKER_LABEL = 'courseraResearchExport'
COURSERA_LOCAL_FOLDER = os.path.expanduser('~/.coursera/exports/')
POSTGRES_DOCKER_IMAGE = 'postgres:9.5'
POSTGRES_INIT_MSG = 'PostgreSQL init process complete; ready for start up.'
POSTGRES_READY_MSG = 'database system is ready to accept connections'
from courseraresearchexports.models.ContainerInfo import ContainerInfo


def list_all(docker_client):
Expand Down
86 changes: 76 additions & 10 deletions courseraresearchexports/db/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,76 @@
import os
import logging
import pkg_resources
import subprocess

from courseraresearchexports.constants.container_constants import \
POSTGRES_DOCKER_IMAGE
from courseraresearchexports.models.ContainerInfo import ContainerInfo
from courseraresearchexports.models.ExportDb import ExportDb
from courseraresearchexports.constants.db_constants import \
HASHED_USER_ID_COLUMN_TO_SOURCE_TABLE


def replace_user_id_placeholders(export_db, sql_text):
    """
    Substitute every known user id placeholder in a SQL string with the
    column name inferred from the given database.
    :param export_db: database wrapper passed on to
        infer_hashed_user_id_columns
    :param sql_text: SQL text that may contain placeholders
    :return: sql_text with each inferred placeholder replaced; placeholders
        for which no column could be inferred are left untouched
    """
    resolved_sql = sql_text
    substitutions = infer_hashed_user_id_columns(export_db)

    for placeholder, column_name in substitutions.items():
        resolved_sql = resolved_sql.replace(placeholder, column_name)

    return resolved_sql


def infer_hashed_user_id_columns(export_db):
    """
    Build the placeholder -> column name mapping for one database.

    For every entry of HASHED_USER_ID_COLUMN_TO_SOURCE_TABLE whose source
    table exists in the database, the table's columns are scanned for a user
    id column; entries with a missing table or no matching column are omitted.
    :param export_db: object exposing ``tables`` and ``get_columns(table)``
    :return: dict of placeholder string to inferred column name
    """
    inferred = {}

    for placeholder, table in HASHED_USER_ID_COLUMN_TO_SOURCE_TABLE.items():
        if table not in export_db.tables:
            continue

        column = infer_user_id_column(export_db.get_columns(table))
        if column:
            inferred[placeholder] = column

    return inferred


def infer_user_id_column(columns):
    """
    Pick the user id column out of a table's column names.
    :param columns: iterable of column name strings
    :return: the first name ending in ``user_id``, or None when there is none
    """
    for name in columns:
        if name.endswith('user_id'):
            return name
    return None


def connect(container_name, docker_client):
    """
    Open an interactive psql shell against a container's database.

    Shells out to ``docker run`` to start a temporary (``--rm``) postgres
    client container linked to the target container, then runs psql in it.
    The exit status of the docker command is not inspected.
    :param container_name: name used to look up the container
    :param docker_client: docker client handed to ContainerInfo.from_container
    """
    info = ContainerInfo.from_container(container_name, docker_client)

    # Client container: interactive TTY, removed on exit, linked to the
    # database container so it is reachable by name.
    docker_args = [
        'docker', 'run', '-it', '--rm',
        '--link', info.name,
        POSTGRES_DOCKER_IMAGE,
    ]
    # psql invocation executed inside the client container.
    psql_args = [
        'psql',
        '-h', info.name,
        '-d', info.database_name,
        '-U', 'postgres',
    ]

    subprocess.call(docker_args + psql_args, shell=False)


def get_table_names(container_name, docker_client):
Expand Down Expand Up @@ -62,8 +130,7 @@ def unload_relation(container_name, dest, relation, docker_client):
return rowcount


def create_registered_view(container_name, view_name, partner_short_name,
docker_client):
def create_registered_view(container_name, view_name, docker_client):
"""
Create a prepackaged view
:param container_name:
Expand All @@ -76,16 +143,15 @@ def create_registered_view(container_name, view_name, partner_short_name,

sql_text = pkg_resources.resource_string(
__name__.split('.')[0], 'sql/{}.sql'.format(view_name))
sql_text_with_partner_short_name = sql_text.replace(
'[partner_short_name]', partner_short_name)
sql_text_with_inferred_columns = replace_user_id_placeholders(
export_db, sql_text)

export_db.create_view(view_name, sql_text_with_partner_short_name)
export_db.create_view(view_name, sql_text_with_inferred_columns)

return view_name


def create_view_from_file(container_name, sql_file, partner_short_name,
docker_client):
def create_view_from_file(container_name, sql_file, docker_client):
"""
Create a view from a sql file.
:param container_name:
Expand All @@ -100,9 +166,9 @@ def create_view_from_file(container_name, sql_file, partner_short_name,
sql_text = sf.read()

view_name = os.path.split(os.path.basename(sql_text))[0]
sql_text_with_partner_short_name = sql_text.replace(
'[partner_short_name]', partner_short_name)
sql_text_with_inferred_columns = replace_user_id_placeholders(
export_db, sql_text)

export_db.create_view(view_name, sql_text_with_partner_short_name)
export_db.create_view(view_name, sql_text_with_inferred_columns)

return view_name
9 changes: 9 additions & 0 deletions courseraresearchexports/models/ExportDb.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,15 @@ def unload_relation(self, relation, output_filename):
rowcount = self.unload(query, output_filename)
return rowcount

def get_columns(self, table):
    """
    List the column names of one table.
    :param table: name of the table to inspect
    :return columns: list with the ``name`` entry of every column the
        inspector reports, in the order reported
    """
    inspector = reflection.Inspector.from_engine(self.engine)

    column_names = []
    for column_meta in inspector.get_columns(table):
        column_names.append(column_meta['name'])
    return column_names

@property
def tables(self):
"""
Expand Down
Loading