diff --git a/.circleci/config.yml b/.circleci/config.yml
index 93e6342..bc4de12 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -3,16 +3,27 @@ jobs:
   build:
     working_directory: ~/phovea
    docker:
-      - image: caleydo/phovea_circleci_python:v2.0
+      - image: circleci/python:3.7-buster-node-browsers # for node version see Dockerfile on https://hub.docker.com/r/circleci/python
     steps:
       - checkout
       - run:
+          name: Show Node.js and npm version
+          command: |
+            node -v
+            npm -v
+      - run:
+          name: Show Python and pip version
+          command: |
+            python --version
+            pip --version
+      - run:
+          name: Install Docker packages from docker_packages.txt
           command: |
             (!(test -f docker_packages.txt) || (cat docker_packages.txt | xargs sudo apt-get install -y))
       - restore_cache:
           key: deps1-{{ .Branch }}-{{ checksum "requirements.txt" }}-{{ checksum "requirements_dev.txt" }}
       - run:
-          name: install-pip-wee
+          name: Install pip requirements
           command: |
             virtualenv ~/venv
             . ~/venv/bin/activate
@@ -22,25 +33,28 @@ jobs:
           key: deps1-{{ .Branch }}-{{ checksum "requirements.txt" }}-{{ checksum "requirements_dev.txt" }}
           paths:
             - ~/venv
-      - run: #force update of VCS dependencies?
-          name: update-pip-vcs-dependencies
+      - run:
+          name: Force an update of pip dependencies from git repositories # not sure if this is working?
           command: |
             . ~/venv/bin/activate
             pip install --upgrade --upgrade-strategy=only-if-needed -r requirements.txt
       - run:
-          name: dist
+          name: Show installed pip packages
+          command: . ~/venv/bin/activate && pip list || true
+      - run:
+          name: Build
           command: |
             . ~/venv/bin/activate
             npm run dist
       - store_artifacts:
           path: dist
-          prefix: dist
+          destination: dist
 workflows:
   version: 2
 # build-nightly:
 #  triggers:
-#   - schedule: # nightly build during weekday
-#     cron: "15 1 * * 1-5"
+#   - schedule:
+#     cron: "15 1 * * 1-5" # "At 01:15 on every day-of-week from Monday through Friday.", see: https://crontab.guru/#15_1_*_*_1-5
 #  filters:
 #   branches:
 #    only:
diff --git a/.gitignore b/.gitignore
index a00b0c0..d283702 100644
--- a/.gitignore
+++ b/.gitignore
@@ -17,3 +17,4 @@ __pycache__/
 # due to using tox and pytest
 .tox
 .cache
+package-lock.json
diff --git a/.travis.yml b/.travis.yml
deleted file mode 100644
index 428bcb9..0000000
--- a/.travis.yml
+++ /dev/null
@@ -1,27 +0,0 @@
-language: python
-sudo: required
-env:
-  - TOXENV=py27
-  - TOXENV=py34
-
-install:
-  - (!(test -f docker_packages.txt) || (cat docker_packages.txt | xargs sudo apt-get install -y))
-  - pip install -r requirements_dev.txt
-  - pip install -r requirements.txt
-
-script: npm run dist
-
-deploy:
-  provider: releases
-  api_key:
-    secure: TK9/P34Bi3WuppiDrBCwVcn41yCBwmILaU8hXTBzUPbT7TbeFIwsC6/4CtH85Z+ZrUve4S5pTmWRNf2dQDxWw3uYu7+bJuemV2J1LHG76mognj+TNEiYxfLQUt3Gql4W7C7FcI4Rlx5/uMN9wY1wro8TWUBMwT6jjSrUWIvK3GXoojd5bHvJx07XpjWl9wCon4D0ruZiFoM2mdeP23lbc2GckETi32oEKswnQXxkMACmxbPzoWbvkxH4aK8Bt2Rj2sl2TbPhVkN6DAkHGkGAvLI+2/aRfG27+oo3OKsaDjbuGABct8TfZccJ970CbQ8kbnCjYxstvqkg1JWjF0W67sX/flBZZOEUA5l0OLWo6HqMGMxm7/lEQhIdPMsRmvXL+HVOxkMrB2dda58QzxVwiZp+rRqUaeabPZp8Kl5xodGrVxsBvxe6zAbJ5jCtCSumG6+kLyKI00/kYlghqQNrgUw0ZsYJlQ34h3lo/24QpaeyDpQoCkGWQgtgqiXGpeKSu7bCnOqIqAy3nbT9Utwj7K8gIasTG5idosEAz/THMampNbGDuyxxc340sYGNMg9Bhm1g2ILWRdtV470p5hwBtIDTKi3/PAizEO26+Wh0zI47Sg3ao57avcbCsTmzbZUeA5J4bojmchhJCHX8su9cSCGh/2fJA/1eBIgEvOQ8LNE=
-  file_glob: true
-  file: dist/taco_server*.egg
-  on:
-    tags: true
-
-notifications:
-  slack:
-    secure: E8/1UIdHSczUbN+6i6gd1d5LM4vmLdwLQ30tpyjvnM0wvfDce76oPxLJAy240WJ5ybXRZUtNrttpVpt4tEXCy8aLFCmxD7s77rVloH+q1J8R/ptTFWZGhFGEujk1awEmVbzcWxJkV9/JENQaeGBKxwv8/EQwWwEkAb7p/+AJb9owmH88b3wUZUGHBWtbMiyyaF4Rm1Wg1stJB8Z1Ga7PRF4cqufTgcDdsCPVv9gAY+VxOIGqX/Vfuc9UWpUH8vq8lHUE7Inn5QS78kuFfSgLWga3H6Mu/Gko1XNlWk0QWWQBUvEZ6ZC6Wuo68KzvUjJHDTnx8WyfHue2JNHIslcX+eJq2WHLeEgM24VeNkILCGo/H/60NGHiSjrIv/Y9h6bQ9FDjo6TUyE4nbdPYN1RN9FQ5UbI9Y4Gi753H9mqnHWlEywBOzHxdZCAuz9Wh03CCF/blsvJ+Obbyo6Jrfe+g44jyi9kQdBNQ78qG6v4EXws8FiYao6x3PpgIwFix42Cpr+soAh5FpA3C1zHSAyZZpXF65/lrDl5yPNofK7Wy0B9bw+0I6Z/u7ZKFNVZXvYPGYvtUVcsALGBdmYc61+LCta36Po0KZseWVAlJj6QnOJDYzv0wvV/zsuf9A5KpYFGiqV9Q7zmtiO5FYF5sBy+lE7O9tHVO4O18IRndhRQgxhs=
-    on_success: change
-    on_failure: always
diff --git a/.yo-rc.json b/.yo-rc.json
index 896b193..9c31be8 100644
--- a/.yo-rc.json
+++ b/.yo-rc.json
@@ -27,6 +27,12 @@
     "debianPackages": [],
     "redhatPackages": []
   },
-  "today": "Tue, 08 Nov 2016 08:36:05 GMT"
+  "today": "Tue, 08 Nov 2016 08:36:05 GMT",
+  "promptValues": {
+    "authorName": "The Caleydo Team",
+    "authorEmail": "contact@caleydo.org",
+    "authorUrl": "https://caleydo.org",
+    "githubAccount": "caleydo"
+  }
  }
 }
\ No newline at end of file
diff --git a/build.py b/build.py
index 2a12e00..63fa1ef 100644
--- a/build.py
+++ b/build.py
@@ -19,7 +19,7 @@ def _resolve_plugin(repo, version):
   if os.path.isdir('.git') and repo:
     if repo.endswith('.git'):
       repo = repo[0:-4]
-    return repo + '/commit/' + _git_head('.')
+    return repo + '/commit/' + _git_head('.').decode('utf-8')
   # not a git repo
   return version
diff --git a/buildPython.js b/buildPython.js
new file mode 100644
index 0000000..c30f2e8
--- /dev/null
+++ b/buildPython.js
@@ -0,0 +1,71 @@
+/**
+ * Created by sam on 13.11.2016.
+ */
+
+const spawnSync = require('child_process').spawnSync;
+const fs = require('fs');
+
+function gitHead(cwd) {
+  const r = spawnSync('git', ['rev-parse', '--verify', 'HEAD'], {
+    cwd: cwd
+  });
+  if (!r.stdout) {
+    console.error(cwd, r.error);
+    return 'error';
+  }
+  return r.stdout.toString().trim();
+}
+
+function resolvePlugin(repo, version) {
+  // guard with existsSync: lstatSync throws when .git is missing (build.py uses os.path.isdir)
+  if (fs.existsSync('.git') && fs.lstatSync('.git').isDirectory() && repo) {
+    if (repo.endsWith('.git')) {
+      repo = repo.slice(0, repo.length - 4);
+    }
+    return repo + '/commit/' + gitHead('.');
+  }
+  // not a git repo
+  return version;
+}
+
+function toVersion(v) {
+  const now = new Date().toISOString();
+  // %Y%m%d-%H%M%S (the g flag strips every dash and colon, not just the first)
+  const fmt = now
+    .replace(/T/, ' ')
+    .replace(/\..+/, '')
+    .replace(/[-:]/g, '')
+    .replace(' ', '-');
+  return v.replace('SNAPSHOT', fmt);
+}
+
+function _main() {
+  const pkg = require('./package.json');
+  const name = pkg.name;
+  const version = toVersion(pkg.version);
+  const resolved = resolvePlugin((pkg.repository || {}).url, version);
+
+  const buildInfo = {
+    name,
+    version,
+    resolved,
+    description: pkg.description,
+    homepage: pkg.homepage,
+    repository: (pkg.repository || {}).url
+  };
+
+  // create build/source/<name> one path segment at a time
+  const l = ('build/source/' + name.toLowerCase()).split('/');
+  l.forEach((_, i) => {
+    const path = l.slice(0, i + 1).join('/');
+    if (!fs.existsSync(path)) {
+      fs.mkdirSync(path);
+    }
+  });
+
+  fs.writeFileSync('build/source/' + name.toLowerCase() + '/buildInfo.json', JSON.stringify(buildInfo, null, ' '));
+}
+
+if (require.main === module) {
+  _main();
+}
diff --git a/data/olympics_generator/count_by_year.py b/data/olympics_generator/count_by_year.py
index 229b97d..797f8b2 100644
--- a/data/olympics_generator/count_by_year.py
+++ b/data/olympics_generator/count_by_year.py
@@ -1,40 +1,41 @@
 import csv
 import json
 
-createdCSVs = []
+created_csvs_list = []
 
-def writeIndexJson():
+
+def write_index_json():
   with open('../index.json', 'w') as outfile:
-    json.dump(createdCSVs, outfile)
+    json.dump(created_csvs_list, outfile)
 
 
-def writeCSV(year, medalType, fieldnames, medalsPerCountry):
+def write_csv(year, medal_type, fieldnames, medals_per_country):
   if year is None:
     print('Invalid year -> file not written')
     return
 
-  name = 'Olympic Games ' + year + ' (' + medalType + ' Medals)'
-  filename = 'olympics_' + year + '_' + medalType.lower() + '.csv'
+  name = 'Olympic Games ' + year + ' (' + medal_type + ' Medals)'
+  filename = 'olympics_' + year + '_' + medal_type.lower() + '.csv'
 
   # sort countries by sum of all medals
-  sortedBySum = sorted(medalsPerCountry.items(), key=lambda x: sum(x[1].values()), reverse=True)
+  sorted_by_sum = sorted(medals_per_country.items(), key=lambda x: sum(x[1].values()), reverse=True)
 
   print('----------------')
   print('Write ' + filename)
   print(fieldnames)
-  print(sortedBySum)
+  print(sorted_by_sum)
 
   # get min and max value of the whole csv for the range
-  maxValue = float('-inf')
-  #minValue = float('inf') # does not work, because we fill empty cells with 0 by default
+  max_value = float('-inf')
+  # min_value = float('inf')  # does not work, because we fill empty cells with 0 by default
 
   with open('../' + filename, 'wb') as output:
     writer = csv.DictWriter(output, fieldnames=fieldnames, restval='0', dialect='excel')
     writer.writeheader()
 
-    for k, v in sortedBySum:
+    for k, v in sorted_by_sum:
       values = list(v.values())
-      maxValue = max(maxValue, max(values))
-      #minValue = min(minValue, min(values))
+      max_value = max(max_value, max(values))
+      # min_value = min(min_value, min(values))
       v['CountryCode'] = k
       writer.writerow(v)
@@ -43,57 +44,59 @@ def writeCSV(year, medalType, fieldnames, medalsPerCountry):
   stats['name'] = name
   stats['path'] = filename
   stats['type'] = 'matrix'
-  stats['size'] = [len(sortedBySum), len(fieldnames)-1]  # -1 = CountryCode fieldname
+  stats['size'] = [len(sorted_by_sum), len(fieldnames)-1]  # -1 = CountryCode fieldname
   stats['rowtype'] = 'Country'
   stats['coltype'] = 'Discipline'
-  stats['value'] = dict(type='real', range=[0, maxValue])
+  stats['value'] = dict(type='real', range=[0, max_value])
 
-  createdCSVs.append(stats)
+  created_csvs_list.append(stats)
 
   print('----------------')
 
 
-def readCSV(medalType = 'Total'):
+
+def read_csv(medal_type='Total'):
   with open('./MedalData1.csv', 'rb') as csvfile:
-    reader = csv.DictReader(csvfile, fieldnames=['Games','Sport','Event','Athlete(s)','CountryCode','CountryName','Medal','ResultInSeconds'], dialect='excel-tab')
+    reader = csv.DictReader(csvfile, fieldnames=['Games', 'Sport', 'Event', 'Athlete(s)', 'CountryCode', 'CountryName', 'Medal', 'ResultInSeconds'], dialect='excel-tab')
     next(reader)
 
-    lastGames = None
+    last_games = None
     fieldnames = ['CountryCode']
-    medalsPerCountry = dict()
+    medals_per_country = dict()
 
     for row in reader:
-      if row['Games'] != lastGames:
+      if row['Games'] != last_games:
         # write old year when a new year is detected
-        writeCSV(lastGames, medalType, fieldnames, medalsPerCountry)
+        write_csv(last_games, medal_type, fieldnames, medals_per_country)
         # clean up variables
         fieldnames = ['CountryCode']
-        medalsPerCountry = dict()
+        medals_per_country = dict()
 
-      lastGames = row['Games']
-      country = row['CountryCode']  # short-cut
+      last_games = row['Games']
+      country = row['CountryCode']  # short-cut
 
       if row['Event'] not in fieldnames:
         fieldnames.append(row['Event'])
 
-      if row['Medal'] == medalType or medalType is 'Total':
-        if country not in medalsPerCountry:
-          medalsPerCountry[country] = dict()
-          #medalsPerCountry[country]['CountryCode'] = country
+      if row['Medal'] == medal_type or medal_type == 'Total':
+        if country not in medals_per_country:
+          medals_per_country[country] = dict()
+          # medals_per_country[country]['CountryCode'] = country
 
-        if row['Event'] not in medalsPerCountry[country]:
-          medalsPerCountry[country][row['Event']] = 0
+        if row['Event'] not in medals_per_country[country]:
+          medals_per_country[country][row['Event']] = 0
 
-        medalsPerCountry[country][row['Event']] += 1
+        medals_per_country[country][row['Event']] += 1
 
-      #print(row['Games'], row['Event'], country, row['Medal'])
+      # print(row['Games'], row['Event'], country, row['Medal'])
 
     # write the last file
-    writeCSV(lastGames, medalType, fieldnames, medalsPerCountry)
+    write_csv(last_games, medal_type, fieldnames, medals_per_country)
+
 
-readCSV('Total')
-readCSV('Bronze')
-readCSV('Silver')
-readCSV('Gold')
+read_csv('Total')
+read_csv('Bronze')
+read_csv('Silver')
+read_csv('Gold')
 
-writeIndexJson()
+write_index_json()
diff --git a/deploy/docker-compose.partial.yml b/deploy/docker-compose.partial.yml
new file mode 100644
index 0000000..745b1fd
--- /dev/null
+++ b/deploy/docker-compose.partial.yml
@@ -0,0 +1 @@
+version: '2.0'
diff --git a/package.json b/package.json
index b00a772..5d5085e 100644
--- a/package.json
+++ b/package.json
@@ -1,4 +1,21 @@
 {
+  "name": "taco_server",
+  "description": "The server part for comparing large tabular data using Phovea",
+  "version": "3.0.0",
+  "author": {
+    "name": "The Caleydo Team",
+    "email": "contact@caleydo.org",
+    "url": "https://caleydo.org"
+  },
+  "license": "BSD-3-Clause",
+  "homepage": "http://caleydo.org",
+  "bugs": {
+    "url": "https://github.com/caleydo/taco_server/issues"
+  },
+  "repository": {
+    "type": "git",
+    "url": "https://github.com/Caleydo/taco_server.git"
+  },
   "files": [
     "taco_server",
     "__init__.py",
@@ -9,30 +26,13 @@
     "docker_packages.txt"
   ],
   "scripts": {
-    "check": "flake8",
+    "check": "flake8 --exclude=.git,venv,deploy,docs,__pycache__,node_modules",
     "pretest": "npm run check",
     "test": "test ! -d tests || python setup.py test",
     "prebuild": "node -e \"process.exit(process.env.PHOVEA_SKIP_TESTS === undefined?1:0)\" || npm run test",
-    "build": "python build.py",
+    "build": "rm -rf build/source && find . -name '*.pyc' -delete && node buildPython.js && cp -r ./taco_server build/source/",
     "predist": "npm run build && npm run docs",
-    "dist": "python setup.py bdist_egg && cd build && tar cvzf ../dist/taco_server.tar.gz *",
+    "dist": "python setup.py sdist bdist_wheel",
     "docs": "sphinx-apidoc -o docs -f ./taco_server && sphinx-build ./docs build/docs"
-  },
-  "name": "taco_server",
-  "description": "The server part for comparing large tabular data using Phovea",
-  "homepage": "http://caleydo.org",
-  "version": "1.0.0-SNAPSHOT",
-  "author": {
-    "name": "The Caleydo Team",
-    "email": "contact@caleydo.org",
-    "url": "https://caleydo.org"
-  },
-  "license": "BSD-3-Clause",
-  "bugs": {
-    "url": "https://github.com/caleydo/taco_server/issues"
-  },
-  "repository": {
-    "type": "git",
-    "url": "https://github.com/caleydo/taco_server.git"
   }
 }
diff --git a/requirements.txt b/requirements.txt
index 97dc403..143e6c9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,2 @@
--e git+https://github.com/phovea/phovea_server.git@develop#egg=phovea_server
-enum==0.4.6
+phovea_server>=4.0.0,<5.0.0
 sklearn==0.0
diff --git a/requirements_dev.txt b/requirements_dev.txt
index 7ca3f68..a7a5dbf 100644
--- a/requirements_dev.txt
+++ b/requirements_dev.txt
@@ -1,6 +1,6 @@
-flake8==3.0.4
-pep8-naming==0.4.1
-pytest==3.0.3
-pytest-runner==2.9
-Sphinx==1.5.2
-recommonmark==0.4.0
+flake8==3.5.0
+pep8-naming==0.5.0
+pytest==3.5.0
+pytest-runner==4.2
+Sphinx==1.7.2
+recommonmark==0.6.0
diff --git a/setup.py b/setup.py
index 18e10f0..1c311ea 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@
 # Copyright (c) The Caleydo Team. All rights reserved.
 # Licensed under the new BSD license, available at http://caleydo.org/license
 ###############################################################################
-from __future__ import with_statement, print_function
+
 from setuptools import setup, find_packages
 from codecs import open
 from os import path
@@ -26,12 +26,12 @@ def read_it(name):
 def packaged(*files):
   r = {}
   global pkg
-  r[pkg['name'].encode('ascii')] = list(files)
+  r[pkg['name']] = list(files)
   return r
 
 
 def requirements(file):
-  return [r.strip().encode('ascii') for r in read_it(file).strip().split('\n') if not r.startswith('-e git+https://')]
+  return [r.strip() for r in read_it(file).strip().split('\n') if not r.startswith('-e git+https://')]
 
 
 def to_version(v):
@@ -39,12 +39,14 @@ def to_version(v):
   now = datetime.datetime.utcnow()
   return v.replace('SNAPSHOT', now.strftime('%Y%m%d-%H%M%S'))
 
+
 setup(
-  name=pkg['name'],
+  name=pkg['name'].lower(),
   version=to_version(pkg['version']),
   url=pkg['homepage'],
   description=pkg['description'],
   long_description=read_it('README.md'),
+  long_description_content_type='text/markdown',
   keywords=pkg.get('keywords', ''),
   author=pkg['author']['name'],
   author_email=pkg['author']['email'],
diff --git a/taco_server/__init__.py b/taco_server/__init__.py
index 757a7b9..5beac01 100644
--- a/taco_server/__init__.py
+++ b/taco_server/__init__.py
@@ -11,10 +11,9 @@ def phovea(registry):
   :param registry:
   """
   # generator-phovea:begin
-  registry.append('namespace', 'taco', 'taco_server.api',
-                  {
-                    'namespace': '/api/taco'
-                  })
+  registry.append('namespace', 'taco', 'taco_server.api', {
+    'namespace': '/api/taco'
+  })
   # generator-phovea:end
   pass
diff --git a/taco_server/api.py b/taco_server/api.py
index 84e4e61..5cbda46 100644
--- a/taco_server/api.py
+++ b/taco_server/api.py
@@ -1,6 +1,6 @@
 from phovea_server import ns
 import timeit
-from src import diff_cache
+from .src import diff_cache
 import logging
@@ -102,6 +102,7 @@ def create():
   """
   return app
 
+
 if __name__ == '__main__':
   app.debug = True
   app.run(host='0.0.0.0', port=9000)
diff --git a/taco_server/src/diff_cache.py b/taco_server/src/diff_cache.py
index 71bf9b7..5b5039b 100644
--- a/taco_server/src/diff_cache.py
+++ b/taco_server/src/diff_cache.py
@@ -3,12 +3,11 @@
 # detail (as detail), middle (as count), overview (as ratios)
 
-from __future__ import print_function
-from diff_finder import Table, DiffFinder, Diff, Ratios
+from .diff_finder import Table, DiffFinder, Diff, Ratios
 import phovea_server.dataset as dataset
 import timeit
 import json
-import pandas.json as ujson
+from . import json_encoder
 import os
 import hashlib
 from collections import namedtuple
@@ -37,6 +36,7 @@ def create_cache_dir():
   else:
     _log.info('use existing cache directory: ' + _cache_directory)
 
+
 # run immediately!
 create_cache_dir()
 
@@ -50,7 +50,7 @@ def get_diff_cache(filename):
   file_name = _cache_directory + filename + '.json'
   if os.path.isfile(file_name):
     with open(file_name) as data_file:
-      data = ujson.load(data_file)
+      data = json.load(data_file)
     return data
   # if the file doesn't exist
   return None
@@ -100,16 +100,16 @@ def get_diff_table(id1, id2, direction, ops, jsonit=True):
 
     if isinstance(diffobj, Diff):
       # log the detail
-      json_result = ujson.dumps(diffobj.serialize())
+      json_result = json.dumps(diffobj.__dict__, cls=json_encoder.JsonEncoder)
       set_diff_cache(hash_name, json_result)
     else:
       # todo later find a way to send the error
       # e.g. there's no matching column in this case
-      json_result = ujson.dumps(diffobj)  # which is {} for now!
+      json_result = json.dumps(diffobj, cls=json_encoder.JsonEncoder)  # which is {} for now!
       set_diff_cache(hash_name, json_result)
   elif jsonit is False:
-    diffobj = Diff().unserialize(ujson.loads(json_result))
+    diffobj = Diff().unserialize(json.loads(json_result))
 
   if jsonit:
     return json_result
@@ -152,10 +153,10 @@ def get_ratios(id1, id2, direction, ops, bins=1, bins_col=1, jsonit=True):
   # bin == 1 -> timeline bar chart
   # bin == -1 -> 2d ratio plot
   if bins == 1 or bins == -1:
-    json_ratios = ujson.dumps(ratios.serialize())
+    json_ratios = json.dumps(ratios.serialize(), cls=json_encoder.JsonEncoder)
   # bin > 1 -> 2d ratio histogram
   else:
-    json_ratios = ujson.dumps(ratios)
+    json_ratios = json.dumps(ratios, cls=json_encoder.JsonEncoder)
 
   # cache this as overview
   set_diff_cache(hashname, json_ratios)
@@ -198,12 +199,12 @@ def stratify_matrix(m):
   if row_strat is not None:
     rowids = list(m.rowids())
     row_indices = [rowids.index(o) for o in row_strat.rowids()]
-    data = data[row_indices, ...]
+    data = data[row_indices, ...].astype('str')
 
   if col_strat is not None:
     colids = list(m.colids())
     col_indices = [colids.index(o) for o in col_strat.rowids()]
-    data = data[..., col_indices]
+    data = data[..., col_indices].astype('str')
 
   return Table(rows, cols, data)
 
@@ -232,12 +233,12 @@ def create_hashname(id1, id2, bins, bins_col, direction, ops):
   :return:
   """
   name = str(id1) + '_' + str(id2) + '_' + str(bins) + '_' + str(bins_col) + '_' + str(direction) + '_' + str(ops)
-  return hashlib.md5(name).hexdigest()
+  return hashlib.md5(name.encode('utf-8')).hexdigest()
 
 
 def ratio_from_json(jsonobj):
   # idk
-  r = json.loads(jsonobj, object_hook=lambda d: namedtuple('X', d.keys())(*d.values()))
+  r = json.loads(jsonobj, object_hook=lambda d: namedtuple('X', d.keys())(*list(d.values())))
   # todo find a smarter way, really
   cr = 0 if not hasattr(r, "c_ratio") else r.c_ratio
   ar = 0 if not hasattr(r, "a_ratio") else r.a_ratio
diff --git a/taco_server/src/diff_finder.py b/taco_server/src/diff_finder.py
index 6470676..13fc5db 100644
--- a/taco_server/src/diff_finder.py
+++ b/taco_server/src/diff_finder.py
@@ -139,8 +139,8 @@ def generate_diff_from_files(file1, file2):
 # Table data structure
 class Table:
   def __init__(self, rows, cols, content):
-    self.row_ids = np.asarray(rows, 'object')
-    self.col_ids = np.asarray(cols, 'object')
+    self.row_ids = np.asarray(rows, 'object').astype(str)
+    self.col_ids = np.asarray(cols, 'object').astype(str)
     self.content = content
 
 
@@ -172,11 +172,11 @@ def serialize(self):
     }
 
   def unserialize(self, json_obj):
-    self.content = [] if json_obj['content'] is None else json_obj['content']
-    self.structure = {} if json_obj['structure'] is None else json_obj['structure']
-    self.merge = {} if json_obj['merge'] is None else json_obj['merge']
-    self.reorder = {'rows': [], 'cols': []} if json_obj['reorder'] is None else json_obj['reorder']
-    self.union = {} if json_obj['union'] is None else json_obj['union']
+    self.content = json_obj['content'] if 'content' in json_obj else []
+    self.structure = json_obj['structure'] if 'structure' in json_obj else {}
+    self.merge = json_obj['merge'] if 'merge' in json_obj else {}
+    self.reorder = json_obj['reorder'] if 'reorder' in json_obj else {'rows': [], 'cols': []}
+    self.union = json_obj['union'] if 'union' in json_obj else {}
     return self
 
   def content_counts_percell(self):
@@ -356,8 +356,8 @@ def reorder_rows_counts(self):
     :param height:
     :return:
     """
-    ids = map(lambda r: r['id'], self.reorder['rows'])
-    filtered_content = filter(lambda r: r['row'] in ids, self.content)
+    ids = [r['id'] for r in self.reorder['rows']]
+    filtered_content = [r for r in self.content if r['row'] in ids]
     return float(len(filtered_content))
 
   def reorder_cols_counts(self):
@@ -367,8 +367,8 @@ def reorder_cols_counts(self):
     :param height:
     :return:
     """
-    ids = map(lambda r: r['id'], self.reorder['cols'])
-    filtered_content = filter(lambda r: r['col'] in ids, self.content)
+    ids = [r['id'] for r in self.reorder['cols']]
+    filtered_content = [r for r in self.content if r['col'] in ids]
     return float(len(filtered_content))
 
   def reorder_rows_cols_counts(self):
@@ -378,9 +378,9 @@ def reorder_rows_cols_counts(self):
     :param height:
     :return:
     """
-    row_ids = map(lambda r: r['id'], self.reorder['rows'])
-    col_ids = map(lambda r: r['id'], self.reorder['cols'])
-    filtered_content = filter(lambda r: r['col'] in col_ids and r['row'] in row_ids, self.content)
+    row_ids = [r['id'] for r in self.reorder['rows']]
+    col_ids = [r['id'] for r in self.reorder['cols']]
+    filtered_content = [r for r in self.content if r['col'] in col_ids and r['row'] in row_ids]
     return float(len(filtered_content))
 
   def reorder_counts(self):
@@ -421,7 +421,7 @@ def aggregate(self, bins, bins_col=2):
     # it's the case of histogram or bar plot
     result = {}
     if self._direction == D_ROWS_COLS or self._direction == D_ROWS:
-      union_rows = self.union['ur_ids']
+      union_rows = self.union['ur_ids'] if 'ur_ids' in self.union else []
       max_height = len(union_rows)
       if bins >= max_height:
         # this is the case of bar plot
@@ -437,7 +437,7 @@ def aggregate(self, bins, bins_col=2):
     # todo the rows might have different bins number than the cols
     if self._direction == D_ROWS_COLS or self._direction == D_COLS:
       # if it's the cols not the rows then switch
-      union_cols = self.union['uc_ids']
+      union_cols = self.union['uc_ids'] if 'uc_ids' in self.union else []
       max_width = len(union_cols)
       if bins_col >= max_width:
         # todo handle the > alone or?
@@ -479,7 +479,7 @@ def per_bin_ratios(self, bins, e_type):
     index2bin = np.digitize(indices, bin_range)
     # todo handle the error here when there's no row !
-    pcontent = [[] for x in xrange(bins)]
+    pcontent = [[] for x in range(bins)]
     for c in self.content:
       ci = union_rows.index(c[row])
       bin_index = index2bin[ci]
@@ -489,7 +489,7 @@ def per_bin_ratios(self, bins, e_type):
       pcontent[bin_index] += [c]
 
     # for structure changes
-    pstructure = [{"added_" + e_type: [], "deleted_" + e_type: []} for x in xrange(bins)]
+    pstructure = [{"added_" + e_type: [], "deleted_" + e_type: []} for x in range(bins)]
     # filter for the structure changes, because once there's a structure change, there's no need to find content #what!!
     for a in self.structure["added_" + e_type]:
       ai = union_rows.index(a['id'])
@@ -508,8 +508,8 @@ def per_bin_ratios(self, bins, e_type):
     # convert to np.array to use np.where
     union_rows = np.array(union_rows)
-    for i in xrange(bins):
-      temp = union_rows[np.where(index2bin == i)[0]]
+    for i in range(bins):
+      temp = union_rows[np.where(index2bin == i)[0]].astype('str').tolist()
       if dir == D_ROWS:
         punion = {
           "ur_ids": temp,
@@ -540,15 +540,15 @@ def per_entity_ratios(self, dir):
     # get a partial diff where every row is a diff
     # 1. Partition
     # get the direction
-    union_rows = self.union['ur_ids']
-    union_cols = self.union['uc_ids']
+    union_rows = self.union['ur_ids'] if 'ur_ids' in self.union else []
+    union_cols = self.union['uc_ids'] if 'uc_ids' in self.union else []
     e_type = "rows"
     row_id = "row"
 
     if dir == D_COLS:
       # if it's the cols not the rows then switch
-      union_rows = self.union['uc_ids']
-      union_cols = self.union['ur_ids']
+      union_rows = self.union['uc_ids'] if 'uc_ids' in self.union else []
+      union_cols = self.union['ur_ids'] if 'ur_ids' in self.union else []
       # todo handle the case of both rows and columns
       e_type = "cols"
       row_id = "col"
@@ -565,18 +565,18 @@ def per_entity_ratios(self, dir):
       pstructure = {}
       # filter for the structure changes, because once there's a structure change, there's no need to find content
       # idk why but obj is Diff!
-      pstructure["added_" + e_type] = filter(lambda obj: obj['id'] == id, self.structure["added_" + e_type])
+      pstructure["added_" + e_type] = [obj for obj in self.structure["added_" + e_type] if obj['id'] == id]
       if len(pstructure["added_" + e_type]) != 0:
         # create a ratio where it's only added
         ratio_counts = RatiosAndCounts(Ratios(0, 1, 0, 0), Counts(0, len(union_cols), 0, 0))
       else:
         # find the deleted
-        pstructure["deleted_" + e_type] = filter(lambda obj: obj['id'] == id, self.structure["deleted_" + e_type])
+        pstructure["deleted_" + e_type] = [obj for obj in self.structure["deleted_" + e_type] if obj['id'] == id]
         if len(pstructure["deleted_" + e_type]) != 0:
           ratio_counts = RatiosAndCounts(Ratios(0, 0, 1, 0), Counts(0, 0, len(union_cols), 0))
         else:
           # find the content
-          pcontent = filter(lambda obj: obj[row_id] == id, self.content)
+          pcontent = [obj for obj in self.content if obj[row_id] == id]
           if len(pcontent) == 0:
             pcontent = None  # more resonable in the case of subtable
@@ -740,7 +740,7 @@ def __init__(self, t1, t2, rowtype, coltype, direction):
     self.diff = Diff(direction=self._direction)
     self.union = {}
     self.intersection = {}  # we only need this for rows when we have content changes
-    self.intersection["ic_ids"] = get_intersection(self._table1.col_ids, self._table2.col_ids)
+    self.intersection["ic_ids"] = get_intersection(self._table1.col_ids, self._table2.col_ids.astype(str))
     if self.intersection["ic_ids"].shape[0] > 0:
       # there's at least one common column between the tables
       # otherwise there's no need to calculate the unions
@@ -838,7 +838,7 @@ def _compare_ids(self, e_type, ids1, ids2, u_ids, has_merge, has_structure, merg
           merged_ids = str(j).split(merge_delimiter)
           for s in merged_ids:
             # delete the delete operations related to those IDs
-            deleted_log = filter(lambda obj: obj['id'] != s, deleted_log)
+            deleted_log = [obj for obj in deleted_log if obj['id'] != s]
             merged_log += [{"id": s, "pos": np.where(u_ids == s)[0][0], "merge_id": merge_id, "is_added": False}]
             merge_id += 1  # increment it
       # log
@@ -872,6 +872,7 @@ def _compare_values1(self):
   # @disordered is an array of the IDs that are available in x and not in the matching position in y (or not available at all)
   # in case x and y are a result of the intersection then disordered is the list of disordered IDs in x
   def _find_reorder(self, ids1, ids2, x, y, disordered, direction):
+    import numpy
     # todo this should be as the size of the original ids not just the intesection ids
     # x shape or y shape should be the same
     # or the shape of the IDs in the second table (original y)
@@ -879,20 +880,26 @@ def _find_reorder(self, ids1, ids2, x, y, disordered, direction):
     reordered = []
     for i in disordered:
       # todo check this with more than 2 changes
-      pos_table1 = np.where(ids1 == i)[0][0]
-      pos_table2 = np.where(ids2 == i)[0][0]
+      if isinstance(i, numpy.ndarray):
+        i = i[0]
+      try:
+        pos_table1 = np.where(ids1 == i)[0][0]
+        pos_table2 = np.where(ids2 == i)[0][0]
+      except IndexError:
+        print('index error')
+        continue  # pos_table1/pos_table2 are undefined here, so skip this id
       # todo substitute this with the new one!
       reordered.append({'id': i, 'from': pos_table1, 'to': pos_table2, 'diff': pos_table2 - pos_table1})
       old = np.where(x == i)[0][0]
       new = np.where(y == i)[0][0]
       np.put(indices, old, new)
-    # index = []
-    # for i in x:
-    #   if i != y[np.where(x == i)[0][0]]:
-    #     index += [np.where(y == i)[0][0]]
-    #   else:
-    #     index += [np.where(x == i)[0][0]]
+      # index = []
+      # for i in x:
+      #   if i != y[np.where(x == i)[0][0]]:
+      #     index += [np.where(y == i)[0][0]]
+      #   else:
+      #     index += [np.where(x == i)[0][0]]
     self._reorder_to_json(direction, reordered)
     return indices
@@ -934,10 +940,10 @@ def _compare_values(self):
       try:
         cdis = cids1[cids1 != cids2]
       except ValueError:
-        # fixing an ungly bug when there are NO unique ids! 
+        # fixing an ugly bug when there are NO unique ids!
         # ## warning! bug ###
         # this happens when one of the tables does NOT have unique ids and the sizes are different... couldn't fix
-        print("Oops! it seems that sizes are not matching", cids1.shape[0], cids2.shape[0])
+        print(("Oops! it seems that sizes are not matching", cids1.shape[0], cids2.shape[0]))
        set_boolean = (np.array(list(set(cids1))) != np.array(list(set(cids2))))
        cdis = cids1[set_boolean]
        # ignore and leave
@@ -949,7 +955,7 @@ def _compare_values(self):
     inter2 = inter2[:, c_indices]
     # at this point inter2 should look good hopefully!
     # diff work
-    diff = inter2 - inter1
+    diff = inter2.astype('float') - inter1.astype('float')
     # done :)
     # normalization
     normalized_diff = normalize_float_11(diff)
diff --git a/taco_server/src/generator.py b/taco_server/src/generator.py
index eef2526..817ea19 100644
--- a/taco_server/src/generator.py
+++ b/taco_server/src/generator.py
@@ -6,12 +6,12 @@
 
 # creates an array with random float values within a range with size
 def random_floats_array(low, high, size):
-  return [random.uniform(low, high) for _ in xrange(size)]
+  return [random.uniform(low, high) for _ in range(size)]
 
 
 # creates an array with random int values within a range with size
 def random_int_array(low, high, size):
-  return [random.randint(low, high) for _ in xrange(size)]
+  return [random.randint(low, high) for _ in range(size)]
 
 
 # creates a homogeneous table
diff --git a/taco_server/src/json_encoder.py b/taco_server/src/json_encoder.py
new file mode 100644
index 0000000..124878c
--- /dev/null
+++ b/taco_server/src/json_encoder.py
@@ -0,0 +1,23 @@
+"""
+JSON encoder that converts numpy integer/float/bytes/ndarray values to plain Python types.
+"""
+import json
+import numpy
+
+
+class JsonEncoder(json.JSONEncoder):
+  def default(self, obj):
+    if isinstance(obj, numpy.integer):
+      return int(obj)
+    elif isinstance(obj, numpy.int64):
+      return int(obj)
+    elif isinstance(obj, bytes):
+      return obj.decode('utf-8')
+    elif isinstance(obj, numpy.bytes_):
+      return obj.decode('utf-8')
+    elif isinstance(obj, numpy.floating):
+      return float(obj)
+    elif isinstance(obj, numpy.ndarray):
+      return obj.tolist()
+    else:
+      return super(JsonEncoder, self).default(obj)
diff --git a/taco_server/src/modifier.py b/taco_server/src/modifier.py
index f69f3dc..5a03ceb 100644
--- a/taco_server/src/modifier.py
+++ b/taco_server/src/modifier.py
@@ -1,7 +1,7 @@
 import numpy as np
 import random
-import generator as gen
-import logger as log
+from . import generator as gen
+from . import logger as log
 
 __author__ = 'Reem'
 
@@ -44,7 +44,7 @@ def del_row(my_array, index):
   array_length = len(my_array)
   # check if the table is empty
   if array_length == 0:
-    print("Error: list is empty, can't delete a row", index)
+    print(("Error: list is empty, can't delete a row", index))
     return my_array
   else:
     if index < array_length:
@@ -58,7 +58,7 @@ def del_col(my_array, index):
   array_length = len(my_array)
   # check if the table is empty
   if array_length == 0:
-    print("Error: list is empty, can't delete a col", index)
+    print(("Error: list is empty, can't delete a col", index))
     return my_array
   else:
     row_length = len(my_array[0])
@@ -165,7 +165,7 @@ def merge_columns(full_table, merge_array):
   # update the IDs
   col_ids.insert(merge_array[0], merged_id)
   log.message("merge", "column", merged_id, merge_array)
-  print(merged_id, cols, merged_col, table)
+  print((merged_id, cols, merged_col, table))
   return {"table": table, "col_ids": col_ids, "row_ids": row_ids}
 
@@ -189,7 +189,7 @@ def merge_rows(full_table, merge_array):
   # update the IDs
   row_ids.insert(merge_array[0], merged_id)
   log.message("merge", "row", merged_id, merge_array)
-  print(merged_id, rows, merged_row, table)
+  print((merged_id, rows, merged_row, table))
   return {"table": table, "col_ids": col_ids, "row_ids": row_ids}
 
@@ -202,27 +202,27 @@ def change_table(full_table, min_data, max_data, operations):
   new_row_id = latest_row_id + 1
   new_col_id = latest_col_id + 1
   # first delete the rows
-  for r in xrange(operations['del_row']):
+  for r in range(operations['del_row']):
     full_table = randomly_change_table(full_table, min_data, max_data, DEL_ROW)
   # then delete the cols
-  for c in xrange(operations['del_col']):
+  for c in range(operations['del_col']):
     full_table = randomly_change_table(full_table, min_data, max_data, DEL_COL)
   # then add rows
-  for r in xrange(operations['add_row']):
+  for r in range(operations['add_row']):
     full_table = randomly_change_table(full_table, min_data, max_data, ADD_ROW, new_row_id)
     new_row_id += 1
   # then add cols
-  for c in xrange(operations['add_col']):
+  for c in range(operations['add_col']):
     full_table = randomly_change_table(full_table, min_data, max_data, ADD_COL, new_col_id)
     new_col_id += 1
   # finally change the cells
-  for c in xrange(operations['ch_cell']):
+  for c in range(operations['ch_cell']):
     full_table = randomly_change_table(full_table, min_data, max_data, CH_CELL)
   # merge operation
   # the order of this operation might change later
   for mc in operations['me_col']:
     # full_table = merge_col(full_table)
-    print ('merge col', mc)
+    print(('merge col', mc))
     full_table = merge_columns(full_table, mc)
   for mr in operations['me_row']:
     full_table = merge_rows(full_table, mr)
@@ -277,7 +277,7 @@ def change_table(full_table, min_data, max_data, operations):
     gen.save_table(result['table'], result['row_ids'], result['col_ids'], data_directory + file_name + str(i + 1) + '_out.csv')
     # just print the size to add it manually to index.json
-    print (result['table'].shape[0], result['table'].shape[1], i)
+    print((result['table'].shape[0], result['table'].shape[1], i))
     # update the ... for next round
     operations_count = {
       'del_row': random.randint(0, 25),
diff --git a/taco_server/src/test1.py b/taco_server/src/test1.py
index 9f632f0..ecb97ae 100644
--- a/taco_server/src/test1.py
+++ b/taco_server/src/test1.py
@@ -39,7 +39,7 @@ def del_row(my_array, index):
   array_length = len(my_array)
   # check if the table is empty
   if array_length == 0:
-    print("Error: list is empty, can't delete a row", index)
+    print(("Error: list is empty, can't delete a row", index))
     return my_array
   else:
     if index < array_length:
@@ -53,7 +53,7 @@ def del_col(my_array, index):
   array_length = len(my_array)
   # check if the table is empty
   if array_length == 0:
-    print("Error: list is empty, can't delete a row", index)
+    print(("Error: list is empty, can't delete a row", index))
     return my_array
   else:
     row_length = len(my_array[0])
@@ -84,7 +84,7 @@ def randomly_change_table(table):
     else:
       # table is empty
       new_row = random.sample(range(min_data, max_data), random.randint(1, largest_row))
-      print("log: add a row in ", index, new_row)
+      print(("log: add a row in ", index, new_row))
       table = add_row(table, index, new_row)
   elif change_type == ADD_COL:
     if len(table) > 0:
@@ -93,24 +93,24 @@ def randomly_change_table(table):
     else:
       index = 0
     new_col = random.sample(range(min_data, max_data), random.randint(1, largest_col))
-    print("log: add a col in ", index, new_col)
+    print(("log: add a col in ", index, new_col))
     table = add_col(table, index, new_col)
   elif change_type == CH_CELL:
     if len(table) > 0:
       i = random.randint(0, len(table) - 1)
       j = random.randint(0, len(table[0]) - 1)
       new_value = random.uniform(min_data, max_data)
-      print("log: change something somewhere ", i, j, new_value)
+      print(("log: change something somewhere ", i, j, new_value))
       table = change_cell(table, i, j, new_value)
     else:
       print("log: there's nothing to change")
   elif change_type == DEL_ROW:
     index = random.randint(0, len(table) - 1)
-    print("log: delete row ", index)
+    print(("log: delete row ", index))
     table = del_row(table, index)
   elif change_type == DEL_COL:
     index = random.randint(0, len(table[0]) - 1)
-    print("log: delete col ", index)
+    print(("log: delete col ", index))
     table = del_col(table, index)
   return table
 
@@ -152,13 +152,13 @@ def randomly_change_table(table):
 # table_3 might be from a file as it has to be big
 input_file = '../../data/small_table_in.csv'
 my_date = np.genfromtxt(input_file, delimiter=',')
-print("this is my data", my_date)
+print(("this is my data", my_date))
 
 output_file = "../../data/small_table_out.csv"
 
 random.seed(100)
 num_of_changes = random.randint(2, 20)
-print("num of changes is ", num_of_changes - 1)
-for i in xrange(1, num_of_changes):
+print(("num of changes is ", num_of_changes - 1))
+for i in range(1, num_of_changes):
   my_date = randomly_change_table(my_date)
   print(my_date)
 # print(table_2)
diff --git a/tox.ini b/tox.ini
index f2734b2..f79fd82 100644
--- a/tox.ini
+++ b/tox.ini
@@ -5,12 +5,11 @@
 ###############################################################################
 
 [tox]
-envlist = py{27,34}
+envlist = py{37}
 
 [testenv]
 basepython =
-    py27: python2.7
-    py34: python3.4
+    py37: python3.7
 deps =
   flake8
   pytest
@@ -21,7 +20,7 @@ commands =
   py.test tests
 
 [flake8]
ignore=E111,E114,E501,E121,E123,E126,E226,E24,E704
 exclude = .tox,*.egg,build,data,.git,__pycache__,docs,node_modules
 
 [pytest]
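
Review note (appended after the patch, not part of it): a minimal sketch of how the new JsonEncoder from taco_server/src/json_encoder.py is exercised; the sample payload below is hypothetical, but it mirrors the json.dumps(..., cls=json_encoder.JsonEncoder) calls that diff_cache.py now makes.

  import json
  import numpy as np

  from taco_server.src.json_encoder import JsonEncoder

  # numpy scalars, bytes, and arrays are not JSON-serializable by default;
  # the encoder's default() hook converts them to plain Python types first.
  payload = {
    'count': np.int64(3),         # numpy.integer -> int
    'ratio': np.float64(0.25),    # numpy.floating -> float
    'ids': np.array(['a', 'b']),  # numpy.ndarray -> list via tolist()
    'label': b'rows',             # bytes -> str via decode('utf-8')
  }
  print(json.dumps(payload, cls=JsonEncoder))
  # {"count": 3, "ratio": 0.25, "ids": ["a", "b"], "label": "rows"}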
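
Review note: to_version() in setup.py and the new toVersion() in buildPython.js are expected to produce the same %Y%m%d-%H%M%S stamp so Python and JS builds agree on SNAPSHOT versions. A quick standalone check of the Python side, copied from setup.py:

  import datetime

  def to_version(v):
    # as in setup.py: replace the SNAPSHOT marker with a UTC timestamp
    now = datetime.datetime.utcnow()
    return v.replace('SNAPSHOT', now.strftime('%Y%m%d-%H%M%S'))

  print(to_version('1.0.0-SNAPSHOT'))  # e.g. 1.0.0-20181123-091523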