CSV: correctly serialize booleans and dates. #3841

Merged
merged 6 commits into from May 29, 2019
Changes from 1 commit
5 changes: 3 additions & 2 deletions redash/handlers/query_results.py
@@ -12,6 +12,7 @@
from redash.tasks.queries import enqueue_query
from redash.utils import (collect_parameters_from_request, gen_query_hash, json_dumps, utcnow, to_filename)
from redash.models.parameterized_query import ParameterizedQuery, InvalidParameterError, dropdown_values
+from redash.serializers import serialize_query_result_to_csv, serialize_query_result_to_xlsx


def error_response(message):
@@ -279,12 +280,12 @@ def make_json_response(self, query_result):
    @staticmethod
    def make_csv_response(query_result):
        headers = {'Content-Type': "text/csv; charset=UTF-8"}
-        return make_response(query_result.make_csv_content(), 200, headers)
+        return make_response(serialize_query_result_to_csv(query_result), 200, headers)

    @staticmethod
    def make_excel_response(query_result):
        headers = {'Content-Type': "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"}
-        return make_response(query_result.make_excel_content(), 200, headers)
+        return make_response(serialize_query_result_to_xlsx(query_result), 200, headers)


class JobResource(BaseResource):
40 changes: 1 addition & 39 deletions redash/models/__init__.py
@@ -1,12 +1,9 @@
-import cStringIO
-import csv
import datetime
import calendar
import logging
import time
import pytz

-import xlsxwriter
from six import python_2_unicode_compatible, text_type
from sqlalchemy import distinct, or_, and_, UniqueConstraint
from sqlalchemy.dialects import postgresql
@@ -25,7 +22,7 @@
get_destination)
from redash.metrics import database # noqa: F401
from redash.query_runner import (get_configuration_schema_for_query_runner_type,
-                                 get_query_runner)
+                                 get_query_runner, TYPE_BOOLEAN, TYPE_DATE, TYPE_DATETIME)
from redash.utils import generate_token, json_dumps, json_loads
from redash.utils.configuration import ConfigurationContainer
from redash.models.parameterized_query import ParameterizedQuery
@@ -322,41 +319,6 @@ def store_result(cls, org, data_source, query_hash, query, data, run_time, retri
    def groups(self):
        return self.data_source.groups

-    def make_csv_content(self):
Member Author:

Used the opportunity to move this serialization logic from redash.models to redash.serializers.

Contributor:
Great! And +1 for tests!

-        s = cStringIO.StringIO()
-
-        query_data = json_loads(self.data)
-        writer = csv.DictWriter(s, extrasaction="ignore", fieldnames=[col['name'] for col in query_data['columns']])
-        writer.writer = utils.UnicodeWriter(s)
-        writer.writeheader()
-        for row in query_data['rows']:
-            writer.writerow(row)
-
-        return s.getvalue()

-    def make_excel_content(self):
-        s = cStringIO.StringIO()
-
-        query_data = json_loads(self.data)
-        book = xlsxwriter.Workbook(s, {'constant_memory': True})
-        sheet = book.add_worksheet("result")
-
-        column_names = []
-        for (c, col) in enumerate(query_data['columns']):
-            sheet.write(0, c, col['name'])
-            column_names.append(col['name'])
-
-        for (r, row) in enumerate(query_data['rows']):
-            for (c, name) in enumerate(column_names):
-                v = row.get(name)
-                if isinstance(v, list) or isinstance(v, dict):
-                    v = str(v).encode('utf-8')
-                sheet.write(r + 1, c, v)
-
-        book.close()
-
-        return s.getvalue()


def should_schedule_next(previous_iteration, now, interval, time=None, day_of_week=None, failures=0):
    # if time exists then interval > 23 hours (82800s)
2 changes: 2 additions & 0 deletions redash/serializers.py → redash/serializers/__init__.py
@@ -12,6 +12,8 @@
from redash.utils import json_loads
from redash.models.parameterized_query import ParameterizedQuery

+from .query_result import serialize_query_result_to_csv, serialize_query_result_to_xlsx


def public_widget(widget):
    res = {
92 changes: 92 additions & 0 deletions redash/serializers/query_result.py
@@ -0,0 +1,92 @@
import cStringIO
import csv
import xlsxwriter
from dateutil.parser import parse as parse_date
from redash.utils import json_loads, UnicodeWriter
from redash.query_runner import (TYPE_BOOLEAN, TYPE_DATE, TYPE_DATETIME)
from redash.authentication.org_resolving import current_org


def convert_format(fmt):
    return fmt.replace('DD', '%d').replace('MM', '%m').replace('YYYY', '%Y').replace('YY', '%y').replace('HH', '%H').replace('mm', '%M').replace('ss', '%s')
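# Illustrative example (not part of the original file):
# convert_format('DD/MM/YY HH:mm') -> '%d/%m/%y %H:%M', which matches the
# org default date_format/time_format the tests below appear to assume.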

def serialize_query_result_to_csv(query_result):
    s = cStringIO.StringIO()

    query_data = json_loads(query_result.data)

    fieldnames = []
    bool_columns = []
Contributor:

This feels a tad too scripty for me. I'd prefer to have less detail and noise here. Perhaps this function could be split into several chunks?

1. Call a function to slice query_data['columns'] into three lists:
   bool_columns, date_columns, datetime_columns = split_columns_by_type(query_data['columns'])
2. Handle the csv.DictWriter stuff and, for each row:
   a. call a function to format booleans
   b. call a function to format dates
   c. call a function to format datetimes
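A minimal sketch of the proposed helper, following the reviewer's naming (split_columns_by_type is hypothetical, not code from this PR):

def split_columns_by_type(columns):
    # Partition column names into the three type buckets the serializer needs.
    bool_columns = [col['name'] for col in columns if col['type'] == TYPE_BOOLEAN]
    date_columns = [col['name'] for col in columns if col['type'] == TYPE_DATE]
    datetime_columns = [col['name'] for col in columns if col['type'] == TYPE_DATETIME]
    return bool_columns, date_columns, datetime_columns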

Member Author:

I started with implementing 1 (5e0ccb9), but when I tried to split it further I realized I would need to pass the lists around 🤢 or convert this into a class 🤔. So I ended up doing this instead: feda3c9.

I didn't do this initially as it felt like too much, but it does clean up the code and makes it easy to add support for new types later.
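The contents of feda3c9 aren't shown on this page; one hypothetical reading of "easy to add support for new types" is a per-type converter map, roughly:

# Illustrative sketch only; the actual commit may differ.
def _convert_bool(value):
    if value is True:
        return "true"
    elif value is False:
        return "false"
    return value

def _convert_datetime(value, fmt):
    # Empty/None values pass through untouched.
    return parse_date(value).strftime(fmt) if value else value

# One converter per column type; supporting a new type is one more entry.
SPECIAL_TYPES = {
    TYPE_BOOLEAN: _convert_bool,
    TYPE_DATETIME: _convert_datetime,
}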

    date_columns = []
    datetime_columns = []

    for col in query_data['columns']:
        fieldnames.append(col['name'])
        if col['type'] == TYPE_BOOLEAN:
Contributor:

These feel pretty mutually exclusive, so an elif might be in place.
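For reference, a sketch of the suggested shape (behavior unchanged, since each column has exactly one type):

if col['type'] == TYPE_BOOLEAN:
    bool_columns.append(col['name'])
elif col['type'] == TYPE_DATE:
    date_columns.append(col['name'])
elif col['type'] == TYPE_DATETIME:
    datetime_columns.append(col['name'])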

            bool_columns.append(col['name'])

        if col['type'] == TYPE_DATE:
            date_columns.append(col['name'])

        if col['type'] == TYPE_DATETIME:
            datetime_columns.append(col['name'])

    writer = csv.DictWriter(s, extrasaction="ignore", fieldnames=[col['name'] for col in query_data['columns']])
    writer.writer = UnicodeWriter(s)
    writer.writeheader()
    for row in query_data['rows']:

        for col in bool_columns:
            if col in row:
                if row[col] == True:
                    row[col] = "true"
                elif row[col] == False:
                    row[col] = "false"

        for col in date_columns:
            if not row[col]:
                continue

            if col in row:
                parsed = parse_date(row[col])

                row[col] = parsed.strftime(convert_format(current_org.get_setting('date_format')))

        for col in datetime_columns:
            if not row[col]:
                continue

            if col in row:
                parsed = parse_date(row[col])

                fmt = convert_format('{} {}'.format(current_org.get_setting('date_format'), current_org.get_setting('time_format')))
                row[col] = parsed.strftime(fmt)

        writer.writerow(row)

    return s.getvalue()


def serialize_query_result_to_xlsx(query_result):
    s = cStringIO.StringIO()

    query_data = json_loads(query_result.data)
    book = xlsxwriter.Workbook(s, {'constant_memory': True})
    sheet = book.add_worksheet("result")

    column_names = []
    for (c, col) in enumerate(query_data['columns']):
        sheet.write(0, c, col['name'])
        column_names.append(col['name'])

    for (r, row) in enumerate(query_data['rows']):
        for (c, name) in enumerate(column_names):
            v = row.get(name)
            if isinstance(v, list) or isinstance(v, dict):
                v = str(v).encode('utf-8')
            sheet.write(r + 1, c, v)

    book.close()

    return s.getvalue()
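A side note on the xlsx path: xlsxwriter's constant_memory mode flushes rows to disk as they are written, which keeps memory bounded and fits this strictly row-by-row loop; the list/dict-to-string coercion guards against cell values xlsxwriter cannot write directly.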
4 changes: 2 additions & 2 deletions tests/models/test_query_results.py
@@ -4,7 +4,7 @@
from tests import BaseTestCase

from redash import models
-from redash.utils import utcnow
+from redash.utils import utcnow, json_dumps


class QueryResultTest(BaseTestCase):
@@ -66,4 +66,4 @@ def test_store_result_does_not_modify_query_update_at(self):

        models.QueryResult.store_result(query.org_id, query.data_source, query.query_hash, query.query_text, "", 0, utcnow())

-        self.assertEqual(original_updated_at, query.updated_at)
+        self.assertEqual(original_updated_at, query.updated_at)
Empty file added tests/serializers/__init__.py
54 changes: 54 additions & 0 deletions tests/serializers/test_query_results.py
@@ -0,0 +1,54 @@
import datetime
import csv
import cStringIO

from tests import BaseTestCase

from redash import models
from redash.utils import utcnow, json_dumps
from redash.serializers import serialize_query_result_to_csv


data = {
    "rows": [
        {"datetime": "2019-05-26T12:39:23.026Z", "bool": True, "date": "2019-05-26"},
        {"datetime": "", "bool": False, "date": ""},
        {"datetime": None, "bool": None, "date": None},
    ],
    "columns": [
        {"friendly_name": "bool", "type": "boolean", "name": "bool"},
        {"friendly_name": "date", "type": "datetime", "name": "datetime"},
        {"friendly_name": "date", "type": "date", "name": "date"}
    ]
}

def get_csv_content(factory):
Contributor:

Why not stuff this inside CsvSerializationTest and avoid the passing of factory?
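One shape that suggestion could take (hypothetical, not what the PR ended up doing): a helper method on the test class, so self.factory is available directly:

class CsvSerializationTest(BaseTestCase):
    def get_csv_content(self):
        # Uses the test factory from BaseTestCase instead of a passed-in argument.
        query_result = self.factory.create_query_result(data=json_dumps(data))
        return serialize_query_result_to_csv(query_result)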

Member Author:

No good reason :) I just didn't realize at first that it would need the factory, and once I moved it out it was easier to pass the factory in -> c6ff138.

    query_result = factory.create_query_result(data=json_dumps(data))
    return serialize_query_result_to_csv(query_result)


class CsvSerializationTest(BaseTestCase):
    def test_serializes_booleans_correctly(self):
        with self.app.test_request_context('/'):
            parsed = csv.DictReader(cStringIO.StringIO(get_csv_content(self.factory)))
            rows = list(parsed)

            self.assertEqual(rows[0]['bool'], 'true')
            self.assertEqual(rows[1]['bool'], 'false')
            self.assertEqual(rows[2]['bool'], '')

    def test_serializes_datatime_with_correct_format(self):
        with self.app.test_request_context('/'):
            parsed = csv.DictReader(cStringIO.StringIO(get_csv_content(self.factory)))
            rows = list(parsed)

            self.assertEqual(rows[0]['datetime'], '26/05/19 12:39')
            self.assertEqual(rows[1]['datetime'], '')
            self.assertEqual(rows[2]['datetime'], '')
            self.assertEqual(rows[0]['date'], '26/05/19')
            self.assertEqual(rows[1]['date'], '')
            self.assertEqual(rows[2]['date'], '')