From 61d82bfe355a557a204a5da43fe1a2a603741964 Mon Sep 17 00:00:00 2001 From: Shripad Date: Wed, 2 Oct 2019 21:43:55 +0900 Subject: [PATCH 01/23] Initial commit for python egg for library of api requester Remove private access token --- .gitconfig | 9 +++ README.adoc | 35 +++++++++ README.md | 2 - build.py | 17 +++++ requirements.txt | 1 + setup.py | 74 +++++++++++++++++++ src/main/python/apirequester/__init__.py | 0 .../python/apirequester/githubrequester.py | 29 ++++++++ src/main/scripts/mlt_github_requester | 43 +++++++++++ 9 files changed, 208 insertions(+), 2 deletions(-) create mode 100644 .gitconfig create mode 100644 README.adoc delete mode 100644 README.md create mode 100644 build.py create mode 100644 requirements.txt create mode 100644 setup.py create mode 100644 src/main/python/apirequester/__init__.py create mode 100644 src/main/python/apirequester/githubrequester.py create mode 100644 src/main/scripts/mlt_github_requester diff --git a/.gitconfig b/.gitconfig new file mode 100644 index 0000000..7b84997 --- /dev/null +++ b/.gitconfig @@ -0,0 +1,9 @@ +build +*.pyc +target +*.ipynb +.ipynb_checkpoints +.DS_Store +sync-config.json +data +.vscode diff --git a/README.adoc b/README.adoc new file mode 100644 index 0000000..8c9e50b --- /dev/null +++ b/README.adoc @@ -0,0 +1,35 @@ +# search-api-requester +API requester for recommendation system + +## Development +Checkout github repository and change directory to repository + +Please set your github_access_token in ```src/main/scripts/mlt_github_requester``` for initial development and testing execution as this is initial code and will be updated later + +create conda environment change to that environment + +``` +conda create -n py36 python=3.6 +source activate py36 +``` + +Intsall pybuilder + +``` +pip install pybuilder +``` + +Execute following command to build the python egg + +``` +pyb && pyb -X && easy_install target/dist/*/dist/*.tar.gz +``` + +Testing example + +``` +mlt_github_requester -k 'machine 
learning' +``` + +## Notes +Please check the ```src/main/scripts/mlt_github_requester``` and ```src/main/python/apirequester/githubrequester.py``` for example and also when testing, please build the python egg ```pyb && pyb -X && easy_install target/dist/*/dist/*.tar.gz``` so that egg will be build for testing. Please ask if any question. Still under development so not the production level code. pypi installation setup will be done later diff --git a/README.md b/README.md deleted file mode 100644 index 8e303c2..0000000 --- a/README.md +++ /dev/null @@ -1,2 +0,0 @@ -# search-api-requester -API requester for recommendation system diff --git a/build.py b/build.py new file mode 100644 index 0000000..8ddf996 --- /dev/null +++ b/build.py @@ -0,0 +1,17 @@ +from pybuilder.core import use_plugin, init + +use_plugin("python.core") +#use_plugin("python.unittest") +use_plugin("python.install_dependencies") +use_plugin("python.flake8") +#use_plugin("python.coverage") +use_plugin("python.distutils") + +requires_python = "==3.6.9" +name = "search-api-requester" +default_task = ["install_dependencies", "publish"] + + +@init +def set_properties(project): + project.depends_on_requirements("requirements.txt") diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..2b41155 --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +PyGithub==1.43.8 \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..495b355 --- /dev/null +++ b/setup.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python +# + +# -*- coding: utf-8 -*- +# +# This file is part of PyBuilder +# +# Copyright 2011-2015 PyBuilder Team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# This script allows to support installation via: +# pip install git+git://@ +# +# This script is designed to be used in combination with `pip install` ONLY +# +# DO NOT RUN MANUALLY +# + +import os +import subprocess +import sys +import glob +import shutil + +from sys import version_info +py3 = version_info[0] == 3 +py2 = not py3 +if py2: + FileNotFoundError = OSError + +script_dir = os.path.dirname(os.path.realpath(__file__)) +exit_code = 0 +try: + subprocess.check_call(["pyb", "--version"]) +except FileNotFoundError as e: + if py3 or py2 and e.errno == 2: + try: + subprocess.check_call([sys.executable, "-m", "pip.__main__", "install", "pybuilder"]) + except subprocess.CalledProcessError as e: + sys.exit(e.returncode) + else: + raise +except subprocess.CalledProcessError as e: + sys.exit(e.returncode) + +try: + subprocess.check_call(["pyb", "clean", "install_build_dependencies", "package", "-o"]) + dist_dir = glob.glob(os.path.join(script_dir, "target", "dist", "*"))[0] + for src_file in glob.glob(os.path.join(dist_dir, "*")): + file_name = os.path.basename(src_file) + target_file_name = os.path.join(script_dir, file_name) + if os.path.exists(target_file_name): + if os.path.isdir(target_file_name): + shutil.rmtree(target_file_name) + else: + os.remove(target_file_name) + shutil.move(src_file, script_dir) + setup_args = sys.argv[1:] + subprocess.check_call([sys.executable, "setup.py"] + setup_args, cwd=script_dir) +except subprocess.CalledProcessError as e: + exit_code = e.returncode +sys.exit(exit_code) diff --git 
a/src/main/python/apirequester/__init__.py b/src/main/python/apirequester/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/main/python/apirequester/githubrequester.py b/src/main/python/apirequester/githubrequester.py new file mode 100644 index 0000000..88eae2f --- /dev/null +++ b/src/main/python/apirequester/githubrequester.py @@ -0,0 +1,29 @@ +from github import Github + + +def get_response(github, keywords): + query = '+'.join(keywords) + '+in:readme+in:description' + result = github.search_repositories(query, 'stars', 'desc') + + print(f'Found {result.totalCount} repo(s)') + + count = 100 + + github_result = {} + + for repo in result: + url = repo.clone_url.replace('.git', '') + labels = repo.get_labels() + stars = repo.stargazers_count + github_result[url] = stars + count -= 1 + if count <= 0: + return github_result + + +def github_requester(access_token, keywords): + github = Github(access_token) + keywords = [keyword.strip() for keyword in keywords.split(',')] + return get_response(github, keywords) + + diff --git a/src/main/scripts/mlt_github_requester b/src/main/scripts/mlt_github_requester new file mode 100644 index 0000000..5946693 --- /dev/null +++ b/src/main/scripts/mlt_github_requester @@ -0,0 +1,43 @@ +#!/usr/bin/env python + +from __future__ import print_function + +import os +import sys +import getopt + +from apirequester.githubrequester import github_requester + +def get_responses(github_access_token, keywords): + github_response = github_requester(github_access_token, keywords) + + for url, stars in github_response.items(): + print(f'{url}, {stars} stars') + + +def main(argv): + github_access_token = '' # please set here your github_access_token + keywords = '' + + try: + opts, argv = getopt.getopt(argv, "hga:k", ["github_access_token=", "keywords="]) + + except getopt.GetoptError: + print("mlt_github_requester -ga -k ") + sys.exit(2) + + for opt, arg in opts: + if opt == "-h": + print("mlt_github_requester -ga -k ") + elif 
opt in ("-ga", "github_access_token="): + github_access_token = arg + elif opt in ("-k", "keywords"): + keywords = ' '.join(argv) + + get_responses(github_access_token, keywords) + + + + +if __name__ == "__main__": + main(sys.argv[1:]) \ No newline at end of file From c724551e62bf6007fdc0934cbe0c4df9b36d05ae Mon Sep 17 00:00:00 2001 From: saihtaungkham <31495612+saihtaungkham@users.noreply.github.com> Date: Sat, 18 Jan 2020 17:09:06 +0900 Subject: [PATCH 02/23] Adding the functionalities to mlsearch package. --- .gitignore | 6 + README.md | 50 ++++++ build.py | 13 +- .../apirequester/__init__.py => docs/.gitkeep | 0 requirements.txt | 3 +- setup.py | 108 +++++------- .../python/apirequester/githubrequester.py | 29 --- src/main/python/mlsearch/__init__.py | 2 + src/main/python/mlsearch/api_requester.py | 166 ++++++++++++++++++ src/main/python/mlsearch/config.py | 16 ++ src/main/python/mlsearch/helper.py | 49 ++++++ src/main/python/mlsearch/protocol.py | 61 +++++++ src/main/scripts/mlsearch | 69 ++++++++ src/main/scripts/mlt_github_requester | 43 ----- src/unittest/python/myproject_tests.py | 13 ++ 15 files changed, 483 insertions(+), 145 deletions(-) create mode 100644 .gitignore create mode 100644 README.md rename src/main/python/apirequester/__init__.py => docs/.gitkeep (100%) mode change 100644 => 100755 setup.py delete mode 100644 src/main/python/apirequester/githubrequester.py create mode 100644 src/main/python/mlsearch/__init__.py create mode 100644 src/main/python/mlsearch/api_requester.py create mode 100644 src/main/python/mlsearch/config.py create mode 100644 src/main/python/mlsearch/helper.py create mode 100644 src/main/python/mlsearch/protocol.py create mode 100644 src/main/scripts/mlsearch delete mode 100644 src/main/scripts/mlt_github_requester create mode 100644 src/unittest/python/myproject_tests.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9e4dacc --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ 
+/src/main/python/mlsearch/__pycache__ +*/unittest/python/__pycache__ +/target +__pycache__ +.coverage +.DS_Store \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..fa15a26 --- /dev/null +++ b/README.md @@ -0,0 +1,50 @@ +# MLSearch Libraries + +The mlsearch libraries are a collection of libraries that act as wrappers over other repositories, fetching the data required for the MLSearch Engine. + +The package can be installed with `python setup.py`. + +
+Standalone usage + +`mlsearch -q query -i start_index -c number_of_result -s source` + +For example +`mlsearch -q "cnn" -i 0 -c 3 -s "github"` + +Available Parameters: +``` + -h, --help show this help message and exit + +Required Parameters: + + -q QUERY, --query QUERY + Keyword for searching. + -i INIT_IDX, --init_idx INIT_IDX + Initial index for pagination. + -c COUNT, --count COUNT + Total number of results to be fetched. + -s SOURCE, --source SOURCE + Source API to be looking for. + +Optional Parameters: + + -u PWC_USER, --pwc_user PWC_USER + Paper with code repository user name. + -p PWC_PASSWORD, --pwc_password PWC_PASSWORD + Paper with code repository password. + -t GITHUB_ACC_TOKEN, --github_acc_token GITHUB_ACC_TOKEN + Github access token. +``` + +
+Using as an API +
+```python +from mlsearch.api_requester import APIRequest + +api_request = APIRequest(source, query, + init_idx, count) +api_request.pwc_auth_info = ('user_name', 'password') +api_request.github_acc_token = 'token' +``` \ No newline at end of file diff --git a/build.py b/build.py index 8ddf996..cb91233 100644 --- a/build.py +++ b/build.py @@ -1,17 +1,20 @@ from pybuilder.core import use_plugin, init use_plugin("python.core") -#use_plugin("python.unittest") +use_plugin("python.unittest") use_plugin("python.install_dependencies") use_plugin("python.flake8") -#use_plugin("python.coverage") +use_plugin("python.coverage") use_plugin("python.distutils") -requires_python = "==3.6.9" -name = "search-api-requester" + +name = "mlsearch" default_task = ["install_dependencies", "publish"] @init def set_properties(project): - project.depends_on_requirements("requirements.txt") + project.set_property("coverage_break_build", False) + project.build_depends_on("mock") + project.build_depends_on("requests") + project.build_depends_on("pygithub") diff --git a/src/main/python/apirequester/__init__.py b/docs/.gitkeep similarity index 100% rename from src/main/python/apirequester/__init__.py rename to docs/.gitkeep diff --git a/requirements.txt b/requirements.txt index 2b41155..656236f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,2 @@ -PyGithub==1.43.8 \ No newline at end of file +PyGithub==1.43.8 +pybuilder \ No newline at end of file diff --git a/setup.py b/setup.py old mode 100644 new mode 100755 index 495b355..b25925b --- a/setup.py +++ b/setup.py @@ -1,74 +1,48 @@ #!/usr/bin/env python -# -# -*- coding: utf-8 -*- -# -# This file is part of PyBuilder -# -# Copyright 2011-2015 PyBuilder Team -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +from setuptools import setup +from setuptools.command.install import install as _install -# -# This script allows to support installation via: -# pip install git+git://@ -# -# This script is designed to be used in combination with `pip install` ONLY -# -# DO NOT RUN MANUALLY -# +class install(_install): + def pre_install_script(self): + pass -import os -import subprocess -import sys -import glob -import shutil + def post_install_script(self): + pass -from sys import version_info -py3 = version_info[0] == 3 -py2 = not py3 -if py2: - FileNotFoundError = OSError + def run(self): + self.pre_install_script() -script_dir = os.path.dirname(os.path.realpath(__file__)) -exit_code = 0 -try: - subprocess.check_call(["pyb", "--version"]) -except FileNotFoundError as e: - if py3 or py2 and e.errno == 2: - try: - subprocess.check_call([sys.executable, "-m", "pip.__main__", "install", "pybuilder"]) - except subprocess.CalledProcessError as e: - sys.exit(e.returncode) - else: - raise -except subprocess.CalledProcessError as e: - sys.exit(e.returncode) + _install.run(self) -try: - subprocess.check_call(["pyb", "clean", "install_build_dependencies", "package", "-o"]) - dist_dir = glob.glob(os.path.join(script_dir, "target", "dist", "*"))[0] - for src_file in glob.glob(os.path.join(dist_dir, "*")): - file_name = os.path.basename(src_file) - target_file_name = os.path.join(script_dir, file_name) - if os.path.exists(target_file_name): - if os.path.isdir(target_file_name): - shutil.rmtree(target_file_name) - else: - os.remove(target_file_name) - shutil.move(src_file, 
script_dir) - setup_args = sys.argv[1:] - subprocess.check_call([sys.executable, "setup.py"] + setup_args, cwd=script_dir) -except subprocess.CalledProcessError as e: - exit_code = e.returncode -sys.exit(exit_code) + self.post_install_script() + +if __name__ == '__main__': + setup( + name = 'mlsearch', + version = '1.0.dev0', + description = '', + long_description = '', + author = '', + author_email = '', + license = '', + url = '', + scripts = ['scripts/mlsearch'], + packages = ['mlsearch'], + namespace_packages = [], + py_modules = [], + classifiers = [ + 'Development Status :: 3 - Alpha', + 'Programming Language :: Python' + ], + entry_points = {}, + data_files = [], + package_data = {}, + install_requires = [], + dependency_links = [], + zip_safe = True, + cmdclass = {'install': install}, + keywords = '', + python_requires = '', + obsoletes = [], + ) diff --git a/src/main/python/apirequester/githubrequester.py b/src/main/python/apirequester/githubrequester.py deleted file mode 100644 index 88eae2f..0000000 --- a/src/main/python/apirequester/githubrequester.py +++ /dev/null @@ -1,29 +0,0 @@ -from github import Github - - -def get_response(github, keywords): - query = '+'.join(keywords) + '+in:readme+in:description' - result = github.search_repositories(query, 'stars', 'desc') - - print(f'Found {result.totalCount} repo(s)') - - count = 100 - - github_result = {} - - for repo in result: - url = repo.clone_url.replace('.git', '') - labels = repo.get_labels() - stars = repo.stargazers_count - github_result[url] = stars - count -= 1 - if count <= 0: - return github_result - - -def github_requester(access_token, keywords): - github = Github(access_token) - keywords = [keyword.strip() for keyword in keywords.split(',')] - return get_response(github, keywords) - - diff --git a/src/main/python/mlsearch/__init__.py b/src/main/python/mlsearch/__init__.py new file mode 100644 index 0000000..669354b --- /dev/null +++ b/src/main/python/mlsearch/__init__.py @@ -0,0 +1,2 @@ 
+def greet(filelike): + filelike.write("Hello world!\n") \ No newline at end of file diff --git a/src/main/python/mlsearch/api_requester.py b/src/main/python/mlsearch/api_requester.py new file mode 100644 index 0000000..442ee0d --- /dev/null +++ b/src/main/python/mlsearch/api_requester.py @@ -0,0 +1,166 @@ +from mlsearch.config import Config +from mlsearch.protocol import Protocol +from github import Github +import json +# import scholarly + +try: + from botocore.vendored import requests + from botocore.vendored.requests.auth import HTTPBasicAuth +except ModuleNotFoundError: + import requests + from requests.auth import HTTPBasicAuth + + +class APIRequest(): + """For handling the different Valid API requests.""" + + def __init__(self, source, query, init_idx, count): + """ + Initialization for the class. + + :param source: The API request destination. + :param query: The query for searching. + :param init_idx: The initial pagination index. + :param count: The number of records to be fetched. + """ + + self.params = {'query':query, 'init_idx':init_idx, + 'count':count, 'source': source} + self.params_model = {'query':str, 'init_idx':int, + 'count':int} + # Load the configuration file + self._config = Config + # Validate Params + self._validate_params() + # Response data + self.data = {'response_code': 201, 'content': None} + + @property + def github_acc_token(self): + return self._config.GITHUB_ACC_TOKEN + + @github_acc_token.setter + def github_acc_token(self, access_token): + if access_token: + self._config.GITHUB_ACC_TOKEN = access_token + + + @property + def pwc_auth_info(self): + return (self._config.PWC_USER_NAME, self._config.PWC_PASSWORD) + + @pwc_auth_info.setter + def pwc_auth_info(self, auth_info: "tuple(user_name, password)"): + assert isinstance(auth_info, tuple), \ + f"Invalid type for auth_info. Expected tuple but got {type(auth_info)}." + if len(auth_info) == 2: + assert isinstance(auth_info[0], str), \ + f"Invalid type for user_name. 
Expected str but got {type(auth_info[0])}." + assert isinstance(auth_info[1], str), \ + f"Invalid type for password. Expected str but got {type(auth_info[1])}." + self._config.PWC_USER_NAME = auth_info[0] + self._config.PWC_PASSWORD = auth_info[1] + else: + raise AttributeError(f"Expected tuple with length 2 but got {len(auth_info)}.") + + def _validate_params(self): + """Validate user input data.""" + + for item, typ in self.params_model.items(): + if item in self.params.keys(): + if not typ == type(self.params[item]): + raise TypeError( + f'Invalid type for {item}. {typ} is expected but ' + f'{type(self.params[item])} is given.') + + if self.params['source'] not in self._config.VALID_API_SOURCE: + raise ValueError( + f"Invalid value for {self.params['source']}. " + f"Expected values are {self._config.VALID_API_SOURCE}") + + def _fetch_github(self) -> [Protocol]: + """Fetch Github Repository""" + + github = Github(self._config.GITHUB_ACC_TOKEN) + query = '+'.join([self.params['query'], self._config.GITHUB_URL]) + responses = github.search_repositories(query, 'stars', 'desc') + results = [] + + for response in responses[ + self.params['init_idx']:self.params['init_idx'] + \ + self.params['count']]: + + data = { + 'repository_url' : response.clone_url.replace('.git', ''), + 'title' : response.name, + 'description' : response.description, + 'private' : response.private, + 'fork' : response.fork, + 'updated_at' : response.updated_at.strftime("%Y%m%dT%H:%M:%S"), + 'stargazers_count' : response.stargazers_count, + 'watchers_count' : response.watchers_count, + 'language' : response.language, + 'forks_count' : response.forks_count, + 'source' : self.params.get('source', '') + } + results.append(Protocol(data)) + + self.data['response_code'] = 200 + self.data['content'] = [proto.to_JSON() for proto in results] + + def _fetch_paperwithcode(self) -> [Protocol]: + """Fetch Paper with Code Repository""" + + results = [] + url = f"{self._config.PWC_URL}{self.params['query']}" + 
query_result = requests.get(url, + auth=HTTPBasicAuth(self._config.PWC_USER_NAME, + self._config.PWC_PASSWORD)) + + if query_result.status_code == 200: + content = json.loads(query_result.content) + content = content[self.params['init_idx']:self.params['init_idx'] + \ + self.params['count']] + + for item in content: + data = { + 'title': item.get('paper_title', None), + 'description': item.get('paper_abstract', None), + 'paper_url': item.get('paper_url', None), + 'num_of_implementations': item.get('num_of_implementations', None), + 'tasks': item.get('tasks', None), + 'paper_conference': item.get('paper_conference', None), + 'repository_url': item.get('repository_url', None), + 'repository_name': item.get('repository_name', None), + 'repository_framework': item.get('repository_framework', None), + 'repository_stars': item.get('repository_stars', None), + 'paper_published': item.get('paper_published', None), + 'source': self.params.get('source', '') + } + results.append(Protocol(data)) + + self.data['content'] = [proto.to_JSON() for proto in results] + + self.data['response_code'] = query_result.status_code + + def fetch_data(self) -> json: + """Fetch the data from designated API source.""" + + try: + if self.params.get('source', '') == 'paperwithcode': + self._fetch_paperwithcode() + + if self.params.get('source', '') == 'github': + responses = self._fetch_github() + + # TODO: Implement the function for Coursera. However, this function + # may be handled by the backend server. 
+ if self.params.get('source', '') == 'coursera': + pass + + except Exception as ex: + self.data['response_code'] = 500 + self.data['content'] = str(ex) + + return self.data \ No newline at end of file diff --git a/src/main/python/mlsearch/config.py b/src/main/python/mlsearch/config.py new file mode 100644 index 0000000..fdbc95a --- /dev/null +++ b/src/main/python/mlsearch/config.py @@ -0,0 +1,16 @@ +import os + +class Config(object): + """Class for API Request configuration.""" + + # Paper with code configuration + PWC_USER_NAME = os.environ.get('PWC_USER_NAME') or '' + PWC_PASSWORD = os.environ.get('PWC_PASSWORD') or '' + PWC_URL = os.environ.get('PWC_URL') or "https://paperswithcode.com/api/v0/search/?q=" + + # Github configuration + GITHUB_ACC_TOKEN = os.environ.get('GITHUB_ACC_TOKEN') or None + GITHUB_URL = os.environ.get('GITHUB_URL') or "in:readme+in:description" + + # AIP Source + VALID_API_SOURCE = ['paperwithcode', 'github', 'coursera'] \ No newline at end of file diff --git a/src/main/python/mlsearch/helper.py b/src/main/python/mlsearch/helper.py new file mode 100644 index 0000000..e019032 --- /dev/null +++ b/src/main/python/mlsearch/helper.py @@ -0,0 +1,49 @@ +def is_valid_parameters(event, param_names): + """ + Check whether the item in param_names exist in event dictionary. + + :param event: Lambda event object. + :param param_names: The list of the param names to be checked. + + :retrun: True if exist else False + """ + for param in param_names: + if not param in event: + return False + return True + +def response(message, status_code): + """ + Response message for the request. + + :param message: The response message. + :param status_code: The response status. + + :return: The dic('statusCode', 'body') + """ + return { + 'statusCode': status_code, + 'body': message + } + +def parse_parameters(event): + """ + Parse the parameters from event dictionary. + + :param event: The event dictionary. 
+ :return: dict('query', 'init_idx', 'count') + """ + try: + param = dict() + param['query'] = event['query'] + param['init_idx'] = int(event['init_idx']) + param['count'] = int(event['count']) + param['source'] = event['source'] + + if param['init_idx'] >= 0 and param['count'] > 0: + return param + else: + return dict() + + except: + return dict() \ No newline at end of file diff --git a/src/main/python/mlsearch/protocol.py b/src/main/python/mlsearch/protocol.py new file mode 100644 index 0000000..efc2d28 --- /dev/null +++ b/src/main/python/mlsearch/protocol.py @@ -0,0 +1,61 @@ +class Protocol(): + """The Protocol for standard communication accross different api sources.""" + + def __init__(self, kwargs): + param_list = [ + + # title -> paper_title, full_name, name + # description -> paper_abstract, description + + # Paper with code + 'title', + 'paper_published', 'paper_url', + 'num_of_implementations', 'tasks', + 'paper_conference', 'repository_url', + 'repository_name', 'repository_framework', + 'repository_stars', + + # Github + 'description', 'private', + 'fork', 'updated_at', + 'stargazers_count', 'watchers_count', + 'language', 'forks_count', + + # Coursera + 'partners_v1', 'instructors_v1', + + # Source Flag + 'source' + ] + + for param in kwargs: + if param not in param_list: + raise AttributeError('{} is not a valid parameter.'.format(param)) + + self.title = kwargs.get('title', None) + self.paper_published = kwargs.get('paper_published', None) + self.paper_url = kwargs.get('paper_url', None) + self.num_of_implementations = kwargs.get('num_of_implementations', None) + self.tasks = kwargs.get('tasks', None) + self.paper_conference = kwargs.get('paper_conference', None) + self.repository_framework = kwargs.get('repository_framework', None) + self.repository_stars = kwargs.get('repository_stars', None) + self.description = kwargs.get('description', None) + self.private = kwargs.get('private', None) + self.fork = kwargs.get('fork', None) + self.updated_at = 
kwargs.get('updated_at', None) + self.stargazers_count = kwargs.get('stargazers', None) + self.watchers_count = kwargs.get('watchers_count', None) + self.language = kwargs.get('language', None) + self.forks_count = kwargs.get('forks_count', None) + self.partners_v1 = kwargs.get('partners_v1', None) + self.instructors_v1 = kwargs.get('instructors_v1', None) + self.source = kwargs.get('source', None) + + def to_JSON(self): + """Transform the Protocol object to JSON object.""" + + return self.__dict__ + + def __repr__(self): + return str(self.__dict__) \ No newline at end of file diff --git a/src/main/scripts/mlsearch b/src/main/scripts/mlsearch new file mode 100644 index 0000000..abafd34 --- /dev/null +++ b/src/main/scripts/mlsearch @@ -0,0 +1,69 @@ +#!/usr/bin/env python +import argparse +import sys +import pprint +import os + +# For debugging purpose +if 'mlsearch' not in sys.modules: + sys.path.append(os.path.join(os.getcwd(), 'src/main/python')) + +from mlsearch.api_requester import APIRequest +from mlsearch import helper as hp + +ap = argparse.ArgumentParser() +ap.add_argument('-q', '--query', required=True, help="Keyword for searching.") +ap.add_argument('-i', '--init_idx', required=True, help="Initial index for pagination.") +ap.add_argument('-c', '--count', required=True, help="Total number of results to be fetched.") +ap.add_argument('-s', '--source', required=True, help="Source API to be looking for.") +ap.add_argument('-u', '--pwc_user', required=False, help="Paper with code repository user name.") +ap.add_argument('-p', '--pwc_password', required=False, help="Paper with code repository password.") +ap.add_argument('-t', '--github_acc_token', required=False, help="Github access token.") +args = vars(ap.parse_args()) + +def main(event): + try: + param_names = ['query', 'init_idx', 'count', 'source'] + response_msg = hp.response('success', 200) + + if hp.is_valid_parameters(event, param_names): + params = hp.parse_parameters(event) + if params.values(): + 
api_request = APIRequest(params['source'], params['query'], params['init_idx'], params['count']) + if 'pwc_user'in event and 'pwc_password' in event: + api_request.pwc_auth_info = (event['pwc_user'], event['pwc_password']) + if 'github_acc_token' in event: + api_request.github_acc_token = event['github_acc_token'] + data = api_request.fetch_data() + response_msg = hp.response(data.get('content',''), data.get('response_code')) + + return response_msg + + response_msg = hp.response('Invalid parameters.', 400) + return response_msg + + except (ValueError, TypeError): + response_msg = hp.response('Invalid parameters.', 400) + + except Exception as ex: + response_msg = hp.response(str(ex), 500) + return response_msg + +if __name__ == "__main__": + event = { + 'query': args['query'], + 'init_idx': args['init_idx'], + 'count': args['count'], + 'source': args['source'] + } + + if args['pwc_user']: + event['pwc_user'] = args['pwc_user'] + if args['pwc_password']: + event['pwc_password'] = args['pwc_password'] + if args['github_acc_token']: + event['github_acc_token'] = args['github_acc_token'] + + result = main(event) + pp = pprint.PrettyPrinter(indent=2) + pp.pprint(result) \ No newline at end of file diff --git a/src/main/scripts/mlt_github_requester b/src/main/scripts/mlt_github_requester deleted file mode 100644 index 5946693..0000000 --- a/src/main/scripts/mlt_github_requester +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env python - -from __future__ import print_function - -import os -import sys -import getopt - -from apirequester.githubrequester import github_requester - -def get_responses(github_access_token, keywords): - github_response = github_requester(github_access_token, keywords) - - for url, stars in github_response.items(): - print(f'{url}, {stars} stars') - - -def main(argv): - github_access_token = '' # please set here your github_access_token - keywords = '' - - try: - opts, argv = getopt.getopt(argv, "hga:k", ["github_access_token=", "keywords="]) - - 
except getopt.GetoptError: - print("mlt_github_requester -ga -k ") - sys.exit(2) - - for opt, arg in opts: - if opt == "-h": - print("mlt_github_requester -ga -k ") - elif opt in ("-ga", "github_access_token="): - github_access_token = arg - elif opt in ("-k", "keywords"): - keywords = ' '.join(argv) - - get_responses(github_access_token, keywords) - - - - -if __name__ == "__main__": - main(sys.argv[1:]) \ No newline at end of file diff --git a/src/unittest/python/myproject_tests.py b/src/unittest/python/myproject_tests.py new file mode 100644 index 0000000..8aa0c2f --- /dev/null +++ b/src/unittest/python/myproject_tests.py @@ -0,0 +1,13 @@ +from unittest import TestCase +from mock import Mock +from mlsearch import greet +from mlsearch.api_requester import APIRequest + + +class Test(TestCase): + def test_should_write_hello_world(self): + mock_stdout = Mock() + + greet(mock_stdout) + + mock_stdout.write.assert_called_with("Hello world!\n") \ No newline at end of file From 0340e74525704fc903d79981ced4acc0f9b036c6 Mon Sep 17 00:00:00 2001 From: saihtaungkham <31495612+saihtaungkham@users.noreply.github.com> Date: Sat, 18 Jan 2020 20:48:14 +0900 Subject: [PATCH 03/23] Remove botocore as it seems to be removed by 2020/03. 
--- src/main/python/mlsearch/api_requester.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/main/python/mlsearch/api_requester.py b/src/main/python/mlsearch/api_requester.py index 442ee0d..9ae107f 100644 --- a/src/main/python/mlsearch/api_requester.py +++ b/src/main/python/mlsearch/api_requester.py @@ -1,16 +1,11 @@ from mlsearch.config import Config from mlsearch.protocol import Protocol from github import Github +from requests.auth import HTTPBasicAuth import json +import requests # import scholarly -try: - from botocore.vendored import requests - from botocore.vendored.requests.auth import HTTPBasicAuth -except ModuleNotFoundError: - import requests - from requests.auth import HTTPBasicAuth - class APIRequest(): """For handling the different Valid API requests.""" From cf9aa10171ccba1362b37d3e98a14e0d83782229 Mon Sep 17 00:00:00 2001 From: saihtaungkham <31495612+saihtaungkham@users.noreply.github.com> Date: Sat, 18 Jan 2020 20:54:48 +0900 Subject: [PATCH 04/23] Remove old readme file. 
--- README.adoc | 35 ----------------------------------- 1 file changed, 35 deletions(-) delete mode 100644 README.adoc diff --git a/README.adoc b/README.adoc deleted file mode 100644 index 8c9e50b..0000000 --- a/README.adoc +++ /dev/null @@ -1,35 +0,0 @@ -# search-api-requester -API requester for recommendation system - -## Development -Checkout github repository and change directory to repository - -Please set your github_access_token in ```src/main/scripts/mlt_github_requester``` for initial development and testing execution as this is initial code and will be updated later - -create conda environment change to that environment - -``` -conda create -n py36 python=3.6 -source activate py36 -``` - -Intsall pybuilder - -``` -pip install pybuilder -``` - -Execute following command to build the python egg - -``` -pyb && pyb -X && easy_install target/dist/*/dist/*.tar.gz -``` - -Testing example - -``` -mlt_github_requester -k 'machine learning' -``` - -## Notes -Please check the ```src/main/scripts/mlt_github_requester``` and ```src/main/python/apirequester/githubrequester.py``` for example and also when testing, please build the python egg ```pyb && pyb -X && easy_install target/dist/*/dist/*.tar.gz``` so that egg will be build for testing. Please ask if any question. Still under development so not the production level code. pypi installation setup will be done later From 59085f7b052b7d7de6c632e732cfbac481892710 Mon Sep 17 00:00:00 2001 From: saihtaungkham <31495612+saihtaungkham@users.noreply.github.com> Date: Wed, 22 Jan 2020 14:23:58 +0900 Subject: [PATCH 05/23] Fix rheza review comment. 
--- src/main/python/mlsearch/api_requester.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/main/python/mlsearch/api_requester.py b/src/main/python/mlsearch/api_requester.py index 9ae107f..23a2f83 100644 --- a/src/main/python/mlsearch/api_requester.py +++ b/src/main/python/mlsearch/api_requester.py @@ -63,11 +63,10 @@ def _validate_params(self): """Validate user input data.""" for item, typ in self.params_model.items(): - if item in self.params.keys(): - if not typ == type(self.params[item]): - raise TypeError( - f'Invalid type for {item}. {typ} is expected but ' - f'{type(self.params[item])} is given.') + if item in self.params.keys() and not typ == type(self.params[item]): + raise TypeError( + f'Invalid type for {item}. {typ} is expected but ' + f'{type(self.params[item])} is given.') if self.params['source'] not in self._config.VALID_API_SOURCE: raise ValueError( From 1835db96c4c87bd967596cef78e896786f4d0430 Mon Sep 17 00:00:00 2001 From: saihtaungkham <31495612+saihtaungkham@users.noreply.github.com> Date: Sun, 2 Feb 2020 00:16:23 +0900 Subject: [PATCH 06/23] Adding the POST request attributes. --- .gitignore | 3 ++- src/main/python/mlsearch/helper.py | 9 ++++++++- src/main/scripts/mlsearch | 14 ++++++++++++-- 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 9e4dacc..c8d98ed 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,5 @@ /target __pycache__ .coverage -.DS_Store \ No newline at end of file +.DS_Store +.vscode \ No newline at end of file diff --git a/src/main/python/mlsearch/helper.py b/src/main/python/mlsearch/helper.py index e019032..d7c13d8 100644 --- a/src/main/python/mlsearch/helper.py +++ b/src/main/python/mlsearch/helper.py @@ -31,7 +31,11 @@ def parse_parameters(event): Parse the parameters from event dictionary. :param event: The event dictionary. 
- :return: dict('query', 'init_idx', 'count') + :return: dict( + 'query', 'init_idx', + 'count', 'source', + 'cookies', 'ip_address', + 'timestamp') """ try: param = dict() @@ -39,6 +43,9 @@ def parse_parameters(event): param['init_idx'] = int(event['init_idx']) param['count'] = int(event['count']) param['source'] = event['source'] + param['cookies'] = event['cookies'] + param['ip_address'] = event['ip_address'] + param['timestamp'] = event['timestamp'] if param['init_idx'] >= 0 and param['count'] > 0: return param diff --git a/src/main/scripts/mlsearch b/src/main/scripts/mlsearch index abafd34..666b6b4 100644 --- a/src/main/scripts/mlsearch +++ b/src/main/scripts/mlsearch @@ -16,6 +16,9 @@ ap.add_argument('-q', '--query', required=True, help="Keyword for searching.") ap.add_argument('-i', '--init_idx', required=True, help="Initial index for pagination.") ap.add_argument('-c', '--count', required=True, help="Total number of results to be fetched.") ap.add_argument('-s', '--source', required=True, help="Source API to be looking for.") +ap.add_argument('-ck', '--cookies', required=True, help="Cookies of current user.") +ap.add_argument('-ip', '--ip_address', required=True, help="IP address of current user.") +ap.add_argument('-tm', '--timestamp', required=True, help="Timestamp of requesting API.") ap.add_argument('-u', '--pwc_user', required=False, help="Paper with code repository user name.") ap.add_argument('-p', '--pwc_password', required=False, help="Paper with code repository password.") ap.add_argument('-t', '--github_acc_token', required=False, help="Github access token.") @@ -23,7 +26,11 @@ args = vars(ap.parse_args()) def main(event): try: - param_names = ['query', 'init_idx', 'count', 'source'] + param_names = [ + 'query', 'init_idx', + 'count', 'source', + 'cookies', 'ip_address', + 'timestamp'] response_msg = hp.response('success', 200) if hp.is_valid_parameters(event, param_names): @@ -54,7 +61,10 @@ if __name__ == "__main__": 'query': args['query'], 
'init_idx': args['init_idx'], 'count': args['count'], - 'source': args['source'] + 'source': args['source'], + 'cookies': args['cookies'], + 'ip_address': args['ip_address'], + 'timestamp': args['timestamp'] } if args['pwc_user']: From 0c34b44de50d42be393b586d16f9608a85b08ae3 Mon Sep 17 00:00:00 2001 From: saihtaungkham <31495612+saihtaungkham@users.noreply.github.com> Date: Sun, 2 Feb 2020 22:16:04 +0900 Subject: [PATCH 07/23] Removing the IP_address attribute. --- src/main/python/mlsearch/helper.py | 4 +--- src/main/scripts/mlsearch | 5 +---- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/src/main/python/mlsearch/helper.py b/src/main/python/mlsearch/helper.py index d7c13d8..ffc274d 100644 --- a/src/main/python/mlsearch/helper.py +++ b/src/main/python/mlsearch/helper.py @@ -34,8 +34,7 @@ def parse_parameters(event): :return: dict( 'query', 'init_idx', 'count', 'source', - 'cookies', 'ip_address', - 'timestamp') + 'cookies', 'timestamp') """ try: param = dict() @@ -44,7 +43,6 @@ def parse_parameters(event): param['count'] = int(event['count']) param['source'] = event['source'] param['cookies'] = event['cookies'] - param['ip_address'] = event['ip_address'] param['timestamp'] = event['timestamp'] if param['init_idx'] >= 0 and param['count'] > 0: diff --git a/src/main/scripts/mlsearch b/src/main/scripts/mlsearch index 666b6b4..1e9b1a9 100644 --- a/src/main/scripts/mlsearch +++ b/src/main/scripts/mlsearch @@ -17,7 +17,6 @@ ap.add_argument('-i', '--init_idx', required=True, help="Initial index for pagin ap.add_argument('-c', '--count', required=True, help="Total number of results to be fetched.") ap.add_argument('-s', '--source', required=True, help="Source API to be looking for.") ap.add_argument('-ck', '--cookies', required=True, help="Cookies of current user.") -ap.add_argument('-ip', '--ip_address', required=True, help="IP address of current user.") ap.add_argument('-tm', '--timestamp', required=True, help="Timestamp of requesting API.") 
ap.add_argument('-u', '--pwc_user', required=False, help="Paper with code repository user name.") ap.add_argument('-p', '--pwc_password', required=False, help="Paper with code repository password.") @@ -29,8 +28,7 @@ def main(event): param_names = [ 'query', 'init_idx', 'count', 'source', - 'cookies', 'ip_address', - 'timestamp'] + 'cookies', 'timestamp'] response_msg = hp.response('success', 200) if hp.is_valid_parameters(event, param_names): @@ -63,7 +61,6 @@ if __name__ == "__main__": 'count': args['count'], 'source': args['source'], 'cookies': args['cookies'], - 'ip_address': args['ip_address'], 'timestamp': args['timestamp'] } From c2f29704e3d83bbee1cf4c898493b8eec6027002 Mon Sep 17 00:00:00 2001 From: saihtaungkham <31495612+saihtaungkham@users.noreply.github.com> Date: Sun, 2 Feb 2020 22:32:53 +0900 Subject: [PATCH 08/23] Updating the readme. --- README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index fa15a26..4366464 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,10 @@ Required Parameters: Total number of results to be fetched. -s SOURCE, --source SOURCE Source API to be looking for. - + -ck COOKIES, --cookies COOKIES + Cookies of current user. + -tm TIMESTAMP, --timestamp TIMESTAMP + Timestamp of requesting API. Optional Parameters: -u PWC_USER, --pwc_user PWC_USER @@ -40,6 +43,7 @@ Optional Parameters:
Using as an API
+ ```python from mlsearch.api_requester import APIRequest From 89a3fd4bf8130b006eae22b43d1d327c9883f83a Mon Sep 17 00:00:00 2001 From: saihtaungkham <31495612+saihtaungkham@users.noreply.github.com> Date: Thu, 13 Feb 2020 20:58:25 +0900 Subject: [PATCH 09/23] Adding and fixing the POST Attr missing. --- src/main/python/mlsearch/api_requester.py | 3 ++- src/main/python/mlsearch/protocol.py | 9 ++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/main/python/mlsearch/api_requester.py b/src/main/python/mlsearch/api_requester.py index 23a2f83..bccfce5 100644 --- a/src/main/python/mlsearch/api_requester.py +++ b/src/main/python/mlsearch/api_requester.py @@ -122,7 +122,7 @@ def _fetch_paperwithcode(self) -> [Protocol]: 'title': item.get('paper_title', None), 'description': item.get('paper_abstract', None), 'paper_url': item.get('paper_url', None), - 'num_of_implementations': item.get('num_of_implementations', None), + 'num_of_implementations': item.get('number_of_implementations', None), 'tasks': item.get('tasks', None), 'paper_conference': item.get('paper_conference', None), 'repository_url': item.get('repository_url', None), @@ -130,6 +130,7 @@ def _fetch_paperwithcode(self) -> [Protocol]: 'repository_framework': item.get('repository_framework', None), 'repository_stars': item.get('repository_stars', None), 'paper_published': item.get('paper_published', None), + 'pwc_url': item.get('pwc_url', ''), 'source': self.params.get('source', '') } results.append(Protocol(data)) diff --git a/src/main/python/mlsearch/protocol.py b/src/main/python/mlsearch/protocol.py index efc2d28..05c67aa 100644 --- a/src/main/python/mlsearch/protocol.py +++ b/src/main/python/mlsearch/protocol.py @@ -13,7 +13,7 @@ def __init__(self, kwargs): 'num_of_implementations', 'tasks', 'paper_conference', 'repository_url', 'repository_name', 'repository_framework', - 'repository_stars', + 'repository_stars', 'pwc_url', # Github 'description', 'private', @@ -31,26 +31,29 @@ def 
__init__(self, kwargs): for param in kwargs: if param not in param_list: raise AttributeError('{} is not a valid parameter.'.format(param)) - + self.title = kwargs.get('title', None) self.paper_published = kwargs.get('paper_published', None) self.paper_url = kwargs.get('paper_url', None) self.num_of_implementations = kwargs.get('num_of_implementations', None) self.tasks = kwargs.get('tasks', None) self.paper_conference = kwargs.get('paper_conference', None) + self.repository_url = kwargs.get('repository_url', None) + self.repository_name = kwargs.get('repository_name', None) self.repository_framework = kwargs.get('repository_framework', None) self.repository_stars = kwargs.get('repository_stars', None) self.description = kwargs.get('description', None) self.private = kwargs.get('private', None) self.fork = kwargs.get('fork', None) self.updated_at = kwargs.get('updated_at', None) - self.stargazers_count = kwargs.get('stargazers', None) + self.stargazers_count = kwargs.get('stargazers_count', None) self.watchers_count = kwargs.get('watchers_count', None) self.language = kwargs.get('language', None) self.forks_count = kwargs.get('forks_count', None) self.partners_v1 = kwargs.get('partners_v1', None) self.instructors_v1 = kwargs.get('instructors_v1', None) self.source = kwargs.get('source', None) + self.pwc_url = kwargs.get('pwc_url', None) def to_JSON(self): """Transform the Protocol object to JSON object.""" From 22eafc66be2f510ba574c7956c3cc4cb2d509de9 Mon Sep 17 00:00:00 2001 From: saihtaungkham <31495612+saihtaungkham@users.noreply.github.com> Date: Fri, 21 Feb 2020 16:04:52 +0900 Subject: [PATCH 10/23] Adding the last_page flag. 
--- src/main/python/mlsearch/api_requester.py | 33 +++++++++++++++++++---- src/main/python/mlsearch/helper.py | 13 +++++---- src/main/scripts/mlsearch | 5 +++- 3 files changed, 40 insertions(+), 11 deletions(-) diff --git a/src/main/python/mlsearch/api_requester.py b/src/main/python/mlsearch/api_requester.py index bccfce5..33076c2 100644 --- a/src/main/python/mlsearch/api_requester.py +++ b/src/main/python/mlsearch/api_requester.py @@ -29,7 +29,10 @@ def __init__(self, source, query, init_idx, count): # Validate Params self._validate_params() # Response data - self.data = {'response_code': 201, 'content': None} + self.data = { + 'response_code': 201, + 'content': None, + 'has_next_page': False} @property def github_acc_token(self): @@ -72,6 +75,18 @@ def _validate_params(self): raise ValueError( f"Invalid value for {self.params['source']}. " f"Expected values are {self._config.VALID_API_SOURCE}") + + def _is_valid_pagination(self, max_count=0): + """Validate pagination.""" + # If init_idx is greater than acutal content + if max_count == 0 or self.params['init_idx'] > max_count: + return False + + # Update pagination flag. 
+ self.data['has_next_page'] = self.params['init_idx'] + \ + self.params['count'] < max_count + + return True def _fetch_github(self) -> [Protocol]: """Fetch Github Repository""" @@ -81,9 +96,12 @@ def _fetch_github(self) -> [Protocol]: responses = github.search_repositories(query, 'stars', 'desc') results = [] + if not self._is_valid_pagination(responses.totalCount): + return + for response in responses[ - self.params['init_idx']:self.params['init_idx'] + \ - self.params['count']]: + self.params['init_idx']:min(self.params['init_idx'] + \ + self.params['count'], responses.totalCount)]: data = { 'repository_url' : response.clone_url.replace('.git', ''), @@ -114,8 +132,13 @@ def _fetch_paperwithcode(self) -> [Protocol]: if query_result.status_code == 200: content = json.loads(query_result.content) - content = content[self.params['init_idx']:self.params['init_idx'] + \ - self.params['count']] + max_content = len(content) + if not self._is_valid_pagination(max_content): + return + + content = content[ + self.params['init_idx']:min(self.params['init_idx'] + \ + self.params['count'], max_content)] for item in content: data = { diff --git a/src/main/python/mlsearch/helper.py b/src/main/python/mlsearch/helper.py index ffc274d..e625f22 100644 --- a/src/main/python/mlsearch/helper.py +++ b/src/main/python/mlsearch/helper.py @@ -12,18 +12,21 @@ def is_valid_parameters(event, param_names): return False return True -def response(message, status_code): +def response(message, status_code, optional_attributes=dict()): """ Response message for the request. - :param message: The response message. - :param status_code: The response status. + :param message: The response message. + :param status_code: The response status. + :optional_attributes: The dict key value used by backend to communicate + with front end. 
- :return: The dic('statusCode', 'body') + :return: The dic('statusCode', 'body', 'optional_attributes') """ return { 'statusCode': status_code, - 'body': message + 'body': message, + 'optional_attributes': optional_attributes } def parse_parameters(event): diff --git a/src/main/scripts/mlsearch b/src/main/scripts/mlsearch index 1e9b1a9..fe4c7ef 100644 --- a/src/main/scripts/mlsearch +++ b/src/main/scripts/mlsearch @@ -40,7 +40,10 @@ def main(event): if 'github_acc_token' in event: api_request.github_acc_token = event['github_acc_token'] data = api_request.fetch_data() - response_msg = hp.response(data.get('content',''), data.get('response_code')) + response_msg = hp.response( + data.get('content',''), + data.get('response_code'), + dict({'has_next_page': data.get('has_next_page', False)})) return response_msg From d1a020d70ee56f83229ca1ec56fb1bf10679bfd4 Mon Sep 17 00:00:00 2001 From: saihtaungkham <31495612+saihtaungkham@users.noreply.github.com> Date: Fri, 21 Feb 2020 19:22:51 +0900 Subject: [PATCH 11/23] Fixing the response format. --- src/main/python/mlsearch/helper.py | 9 ++++++--- src/main/scripts/mlsearch | 15 +++++++++++---- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/src/main/python/mlsearch/helper.py b/src/main/python/mlsearch/helper.py index e625f22..e53bef8 100644 --- a/src/main/python/mlsearch/helper.py +++ b/src/main/python/mlsearch/helper.py @@ -1,3 +1,5 @@ +import json + def is_valid_parameters(event, param_names): """ Check whether the item in param_names exist in event dictionary. @@ -12,12 +14,13 @@ def is_valid_parameters(event, param_names): return False return True -def response(message, status_code, optional_attributes=dict()): +def response(message, status_code, headers=dict(), optional_attributes=dict()): """ Response message for the request. :param message: The response message. :param status_code: The response status. + :headers: The header of the response. 
:optional_attributes: The dict key value used by backend to communicate with front end. @@ -25,8 +28,8 @@ def response(message, status_code, optional_attributes=dict()): """ return { 'statusCode': status_code, - 'body': message, - 'optional_attributes': optional_attributes + 'body': json.dumps({'content': message, 'optional_attributes': optional_attributes}), + 'headers': headers } def parse_parameters(event): diff --git a/src/main/scripts/mlsearch b/src/main/scripts/mlsearch index fe4c7ef..085c385 100644 --- a/src/main/scripts/mlsearch +++ b/src/main/scripts/mlsearch @@ -3,6 +3,7 @@ import argparse import sys import pprint import os +import json # For debugging purpose if 'mlsearch' not in sys.modules: @@ -24,6 +25,12 @@ ap.add_argument('-t', '--github_acc_token', required=False, help="Github access args = vars(ap.parse_args()) def main(event): + headers = { + 'Access-Control-Allow-Origin': '*', + 'X-Requested-With': '*', + 'Access-Control-Allow-Headers': 'Content-Type,X-Amz-Date,Authorization,X-Api-Key,x-requested-with', + 'Access-Control-Allow-Methods': 'OPTIONS,POST,GET' + } try: param_names = [ 'query', 'init_idx', @@ -41,10 +48,10 @@ def main(event): api_request.github_acc_token = event['github_acc_token'] data = api_request.fetch_data() response_msg = hp.response( - data.get('content',''), - data.get('response_code'), - dict({'has_next_page': data.get('has_next_page', False)})) - + message=data.get('content',''), + status_code=data.get('response_code'), + headers=headers, + optional_attributes={'has_next_page': data.get('has_next_page', False)}) return response_msg response_msg = hp.response('Invalid parameters.', 400) From 704d579b4d5fd671b4819a7b95e89408a2ae7b63 Mon Sep 17 00:00:00 2001 From: saihtaungkham <31495612+saihtaungkham@users.noreply.github.com> Date: Sun, 8 Mar 2020 20:22:35 +0900 Subject: [PATCH 12/23] Adding Youtube API. 
Modify Error Message Update README.md Update README.md Update src/main/python/mlsearch/api_requester.py Update src/main/python/mlsearch/api_requester.py Update src/main/scripts/mlsearch Update src/main/scripts/mlsearch Update README.md Update README.md Update README.md Update src/main/python/mlsearch/api_requester.py Update src/main/python/mlsearch/api_requester.py Update src/main/python/mlsearch/api_requester.py Update src/main/python/mlsearch/api_requester.py Update src/main/python/mlsearch/api_requester.py Update README.md Update src/main/python/mlsearch/api_requester.py Update src/main/python/mlsearch/api_requester.py Update src/main/python/mlsearch/api_requester.py Update src/main/scripts/mlsearch Update src/main/python/mlsearch/helper.py Update src/main/python/mlsearch/helper.py Update src/main/scripts/mlsearch Update src/main/scripts/mlsearch Update src/main/scripts/mlsearch Update src/main/scripts/mlsearch Fix missing source in youtube. --- README.md | 13 ++-- build.py | 1 + requirements.txt | 4 +- src/main/python/mlsearch/api_requester.py | 84 ++++++++++++++++++++--- src/main/python/mlsearch/config.py | 10 ++- src/main/python/mlsearch/helper.py | 6 +- src/main/python/mlsearch/protocol.py | 14 +++- src/main/scripts/mlsearch | 38 +++++++--- 8 files changed, 141 insertions(+), 29 deletions(-) diff --git a/README.md b/README.md index 4366464..c425d65 100644 --- a/README.md +++ b/README.md @@ -32,12 +32,16 @@ Required Parameters: Timestamp of requesting API. Optional Parameters: - -u PWC_USER, --pwc_user PWC_USER + -pu PWC_USER, --pwc_user PWC_USER Paper with code repository user name. - -p PWC_PASSWORD, --pwc_password PWC_PASSWORD + -pp PWC_PASSWORD, --pwc_password PWC_PASSWORD Paper with code repository password. - -t GITHUB_ACC_TOKEN, --github_acc_token GITHUB_ACC_TOKEN + -gt GITHUB_ACC_TOKEN, --github_acc_token GITHUB_ACC_TOKEN Github access token. + -yk YOUTUBE_DEV_KEY, --youtube_dev_key YOUTUBE_DEV_KEY + Youtube developer key. 
+ -ynpt NEXT_PAGE_TOKEN, --y_next_page_token NEXT_PAGE_TOKEN + Next page token for Youtube API. ```
@@ -51,4 +55,5 @@ api_request = APIRequest(source, query, init_idx, count) api_request.pwc_auth_info = ('user_name', 'password') api_request.github_acc_token = 'token' -``` \ No newline at end of file +api_request.youtube_developer_key = 'your_key' +``` diff --git a/build.py b/build.py index cb91233..dd34c76 100644 --- a/build.py +++ b/build.py @@ -18,3 +18,4 @@ def set_properties(project): project.build_depends_on("mock") project.build_depends_on("requests") project.build_depends_on("pygithub") + project.build_depends_on("google-api-python-client") diff --git a/requirements.txt b/requirements.txt index 656236f..c118007 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,4 @@ PyGithub==1.43.8 -pybuilder \ No newline at end of file +pybuilder +requests +google-api-python-client \ No newline at end of file diff --git a/src/main/python/mlsearch/api_requester.py b/src/main/python/mlsearch/api_requester.py index 33076c2..3077ba5 100644 --- a/src/main/python/mlsearch/api_requester.py +++ b/src/main/python/mlsearch/api_requester.py @@ -2,6 +2,7 @@ from mlsearch.protocol import Protocol from github import Github from requests.auth import HTTPBasicAuth +import googleapiclient.discovery import json import requests # import scholarly @@ -10,7 +11,7 @@ class APIRequest(): """For handling the different Valid API requests.""" - def __init__(self, source, query, init_idx, count): + def __init__(self, source, query, init_idx, count, y_next_page_token=None): """ Initialization for the class. @@ -18,10 +19,12 @@ def __init__(self, source, query, init_idx, count): :param query: The query for searching. :param init_idx: The initial pagination index. :param count: The number of records to be fetched. + :param y_next_page_token: The current page token for youtube API. 
""" self.params = {'query':query, 'init_idx':init_idx, - 'count':count, 'source': source} + 'count':count, 'source': source, + 'y_next_page_token': y_next_page_token} self.params_model = {'query':str, 'init_idx':int, 'count':int} # Load the configuration file @@ -32,7 +35,8 @@ def __init__(self, source, query, init_idx, count): self.data = { 'response_code': 201, 'content': None, - 'has_next_page': False} + 'has_next_page': False, + 'y_next_page_token': None} @property def github_acc_token(self): @@ -43,6 +47,14 @@ def github_acc_token(self, access_token): if access_token: self._config.GITHUB_ACC_TOKEN = access_token + @property + def youtube_developer_key(self): + return self._config.YOUTUBE_DEVELOPER_KEY + + @youtube_developer_key.setter + def youtube_developer_key(self, developer_key): + if developer_key: + self._config.YOUTUBE_DEVELOPER_KEY = developer_key @property def pwc_auth_info(self): @@ -70,7 +82,6 @@ def _validate_params(self): raise TypeError( f'Invalid type for {item}. {typ} is expected but ' f'{type(self.params[item])} is given.') - if self.params['source'] not in self._config.VALID_API_SOURCE: raise ValueError( f"Invalid value for {self.params['source']}. 
" @@ -118,8 +129,8 @@ def _fetch_github(self) -> [Protocol]: } results.append(Protocol(data)) - self.data['response_code'] = 200 - self.data['content'] = [proto.to_JSON() for proto in results] + self.data['response_code'] = 200 + self.data['content'] = [proto.to_JSON() for proto in results] def _fetch_paperwithcode(self) -> [Protocol]: """Fetch Paper with Code Repository""" @@ -161,7 +172,59 @@ def _fetch_paperwithcode(self) -> [Protocol]: self.data['content'] = [proto.to_JSON() for proto in results] self.data['response_code'] = query_result.status_code - + + def _fetch_youtube(self, y_next_page_token=None) -> [Protocol]: + """Fetch the Youtube Repository""" + results = [] + youtube = googleapiclient.discovery.build( + self._config.YOUTUBE_SERVICE_NAME, + self._config.YOUTUBE_API_VERSION, + developerKey = self._config.YOUTUBE_DEVELOPER_KEY) + request = youtube.search().list( + part=self._config.YOUTUBE_PART, + maxResults=self.params['count'], + order=self._config.YOUTUBE_ORDER, + q=self.params['query'], + safeSearch=self._config.YOUTUBE_SAFESEARCH, + pageToken=y_next_page_token + ) + response = request.execute() + + if 'items' in response and len(response['items']) > 0: + for item in response['items']: + data = { + 'video_id': item.get( + 'id', dict({'videoId': None}) + ).get('videoId', None), + 'title': item.get( + 'snippet', dict({'title': None}) + ).get('title', None), + 'description': item.get( + 'snippet',dict({'description': None}) + ).get('description', None), + 'channel_id': item.get( + 'snippet',dict({'channelId': None}) + ).get('channelId', None), + 'channel_title': item.get( + 'snippet',dict({'channelTitle': None}) + ).get('channelTitle', None), + 'live_broadcast_content': item.get( + 'snippet',dict({'liveBroadcastContent': None}) + ).get('liveBroadcastContent', None), + 'published_datetime': item.get( + 'snippet',dict({'publishedAt': None}) + ).get('publishedAt', None), + 'thumbnails': item.get( + 'snippet',dict({'thumbnails': None}) + 
).get('thumbnails', None), + 'source': self.params.get('source', ''), + } + results.append(Protocol(data)) + self.data['y_next_page_token'] = response.get('nextPageToken', None) + self.data['content'] = [proto.to_JSON() for proto in results] + self.data['has_next_page'] = response.get('pageInfo', dict({'totalResults':0})).get('totalResults', 0) > 0 + self.data['response_code'] = 200 + def fetch_data(self) -> json: """Fetch the data from designated API source.""" @@ -170,7 +233,10 @@ def fetch_data(self) -> json: self._fetch_paperwithcode() if self.params.get('source', '') == 'github': - responses = self._fetch_github() + self._fetch_github() + + if self.params.get('source', '') == 'youtube': + self._fetch_youtube(self.params.get('y_next_page_token', None)) # TODO: Implement the function for Coursera. However, this function # may be handled by the backend server. @@ -181,4 +247,4 @@ def fetch_data(self) -> json: self.data['response_code'] = 500 self.data['content'] = str(ex) - return self.data \ No newline at end of file + return self.data diff --git a/src/main/python/mlsearch/config.py b/src/main/python/mlsearch/config.py index fdbc95a..c53574f 100644 --- a/src/main/python/mlsearch/config.py +++ b/src/main/python/mlsearch/config.py @@ -13,4 +13,12 @@ class Config(object): GITHUB_URL = os.environ.get('GITHUB_URL') or "in:readme+in:description" # AIP Source - VALID_API_SOURCE = ['paperwithcode', 'github', 'coursera'] \ No newline at end of file + VALID_API_SOURCE = ['paperwithcode', 'github', 'coursera', 'youtube'] + + # Youtube configuration + YOUTUBE_SERVICE_NAME = os.environ.get('YOUTUBE_SERVICE_NAME') or "youtube" + YOUTUBE_API_VERSION = os.environ.get('YOUTUBE_API_VERSION') or "v3" + YOUTUBE_DEVELOPER_KEY = os.environ.get('YOUTUBE_DEVELOPER_KEY') or None + YOUTUBE_ORDER = os.environ.get('YOUTUBE_ORDER') or "relevance" + YOUTUBE_SAFESEARCH = os.environ.get('YOUTUBE_SAFESEARCH') or "strict" + YOUTUBE_PART = os.environ.get('YOUTUBE_PART') or "snippet" \ No newline 
at end of file diff --git a/src/main/python/mlsearch/helper.py b/src/main/python/mlsearch/helper.py index e53bef8..111cf5f 100644 --- a/src/main/python/mlsearch/helper.py +++ b/src/main/python/mlsearch/helper.py @@ -40,7 +40,8 @@ def parse_parameters(event): :return: dict( 'query', 'init_idx', 'count', 'source', - 'cookies', 'timestamp') + 'cookies', 'timestamp', + 'y_next_page_token') """ try: param = dict() @@ -50,6 +51,7 @@ def parse_parameters(event): param['source'] = event['source'] param['cookies'] = event['cookies'] param['timestamp'] = event['timestamp'] + param['y_next_page_token'] = event['y_next_page_token'] if param['init_idx'] >= 0 and param['count'] > 0: return param @@ -57,4 +59,4 @@ def parse_parameters(event): return dict() except: - return dict() \ No newline at end of file + return dict() diff --git a/src/main/python/mlsearch/protocol.py b/src/main/python/mlsearch/protocol.py index 05c67aa..826cc46 100644 --- a/src/main/python/mlsearch/protocol.py +++ b/src/main/python/mlsearch/protocol.py @@ -25,7 +25,13 @@ def __init__(self, kwargs): 'partners_v1', 'instructors_v1', # Source Flag - 'source' + 'source', + + # Youtube + 'video_id', + 'channel_id', 'channel_title', + 'live_broadcast_content', 'published_datetime', + 'thumbnails', ] for param in kwargs: @@ -54,6 +60,12 @@ def __init__(self, kwargs): self.instructors_v1 = kwargs.get('instructors_v1', None) self.source = kwargs.get('source', None) self.pwc_url = kwargs.get('pwc_url', None) + self.video_id = kwargs.get('video_id', None) + self.channel_id = kwargs.get('channel_id', None) + self.channel_title = kwargs.get('channel_title', None) + self.live_broadcast_content = kwargs.get('live_broadcast_content', None) + self.published_datetime = kwargs.get('published_datetime', None) + self.thumbnails = kwargs.get('thumbnails', dict()) def to_JSON(self): """Transform the Protocol object to JSON object.""" diff --git a/src/main/scripts/mlsearch b/src/main/scripts/mlsearch index 085c385..0d1b98e 100644 
--- a/src/main/scripts/mlsearch +++ b/src/main/scripts/mlsearch @@ -19,9 +19,11 @@ ap.add_argument('-c', '--count', required=True, help="Total number of results to ap.add_argument('-s', '--source', required=True, help="Source API to be looking for.") ap.add_argument('-ck', '--cookies', required=True, help="Cookies of current user.") ap.add_argument('-tm', '--timestamp', required=True, help="Timestamp of requesting API.") -ap.add_argument('-u', '--pwc_user', required=False, help="Paper with code repository user name.") -ap.add_argument('-p', '--pwc_password', required=False, help="Paper with code repository password.") -ap.add_argument('-t', '--github_acc_token', required=False, help="Github access token.") +ap.add_argument('-pu', '--pwc_user', required=False, help="Paper with code repository user name.") +ap.add_argument('-pp', '--pwc_password', required=False, help="Paper with code repository password.") +ap.add_argument('-gt', '--github_acc_token', required=False, help="Github access token.") +ap.add_argument('-yk', '--youtube_dev_key', required=False, help="Youtube developer key.") +ap.add_argument('-yntp', '--y_next_page_token', required=False, help="Next page token for Youtube API.") args = vars(ap.parse_args()) def main(event): @@ -35,30 +37,41 @@ def main(event): param_names = [ 'query', 'init_idx', 'count', 'source', - 'cookies', 'timestamp'] + 'cookies', 'timestamp', + 'y_next_page_token'] response_msg = hp.response('success', 200) - if hp.is_valid_parameters(event, param_names): params = hp.parse_parameters(event) if params.values(): - api_request = APIRequest(params['source'], params['query'], params['init_idx'], params['count']) + api_request = APIRequest( + params['source'], + params['query'], + params['init_idx'], + params['count'], + params['y_next_page_token']) if 'pwc_user'in event and 'pwc_password' in event: api_request.pwc_auth_info = (event['pwc_user'], event['pwc_password']) if 'github_acc_token' in event: api_request.github_acc_token = 
event['github_acc_token'] + if 'youtube_developer_key' in event: + api_request.youtube_developer_key = event['youtube_developer_key'] data = api_request.fetch_data() response_msg = hp.response( message=data.get('content',''), status_code=data.get('response_code'), headers=headers, - optional_attributes={'has_next_page': data.get('has_next_page', False)}) + optional_attributes={ + 'has_next_page': data.get('has_next_page', False), + 'y_next_page_token': data.get('y_next_page_token', None)}) + return response_msg response_msg = hp.response('Invalid parameters.', 400) return response_msg - except (ValueError, TypeError): - response_msg = hp.response('Invalid parameters.', 400) + except (ValueError, TypeError) as ex: + response_msg = hp.response(str(ex), 400) + return response_msg except Exception as ex: response_msg = hp.response(str(ex), 500) @@ -71,7 +84,8 @@ if __name__ == "__main__": 'count': args['count'], 'source': args['source'], 'cookies': args['cookies'], - 'timestamp': args['timestamp'] + 'timestamp': args['timestamp'], + 'y_next_page_token': args['y_next_page_token'] } if args['pwc_user']: @@ -80,7 +94,9 @@ if __name__ == "__main__": event['pwc_password'] = args['pwc_password'] if args['github_acc_token']: event['github_acc_token'] = args['github_acc_token'] + if args['youtube_dev_key']: + event['youtube_developer_key'] = args['youtube_dev_key'] result = main(event) pp = pprint.PrettyPrinter(indent=2) - pp.pprint(result) \ No newline at end of file + pp.pprint(result) From 7f9cf74788c25c8ea9c67112470c3c203fae228c Mon Sep 17 00:00:00 2001 From: saihtaungkham <31495612+saihtaungkham@users.noreply.github.com> Date: Wed, 6 May 2020 23:49:18 +0900 Subject: [PATCH 13/23] Fix html escape and code formatting. 
--- src/main/python/mlsearch/api_requester.py | 275 +++++++++++++--------- 1 file changed, 167 insertions(+), 108 deletions(-) diff --git a/src/main/python/mlsearch/api_requester.py b/src/main/python/mlsearch/api_requester.py index 3077ba5..860c0c8 100644 --- a/src/main/python/mlsearch/api_requester.py +++ b/src/main/python/mlsearch/api_requester.py @@ -5,10 +5,12 @@ import googleapiclient.discovery import json import requests +import html + # import scholarly -class APIRequest(): +class APIRequest: """For handling the different Valid API requests.""" def __init__(self, source, query, init_idx, count, y_next_page_token=None): @@ -22,21 +24,25 @@ def __init__(self, source, query, init_idx, count, y_next_page_token=None): :param y_next_page_token: The current page token for youtube API. """ - self.params = {'query':query, 'init_idx':init_idx, - 'count':count, 'source': source, - 'y_next_page_token': y_next_page_token} - self.params_model = {'query':str, 'init_idx':int, - 'count':int} + self.params = { + "query": query, + "init_idx": init_idx, + "count": count, + "source": source, + "y_next_page_token": y_next_page_token, + } + self.params_model = {"query": str, "init_idx": int, "count": int} # Load the configuration file self._config = Config # Validate Params self._validate_params() # Response data self.data = { - 'response_code': 201, - 'content': None, - 'has_next_page': False, - 'y_next_page_token': None} + "response_code": 201, + "content": None, + "has_next_page": False, + "y_next_page_token": None, + } @property def github_acc_token(self): @@ -62,17 +68,22 @@ def pwc_auth_info(self): @pwc_auth_info.setter def pwc_auth_info(self, auth_info: "tuple(user_name, password)"): - assert isinstance(auth_info, tuple), \ - f"Invalid type for auth_info. Expected tuple but got {type(auth_info)}." + assert isinstance( + auth_info, tuple + ), f"Invalid type for auth_info. Expected tuple but got {type(auth_info)}." 
if len(auth_info) == 2: - assert isinstance(auth_info[0], str), \ - f"Invalid type for user_name. Expected str but got {type(auth_info[0])}." - assert isinstance(auth_info[1], str), \ - f"Invalid type for password. Expected str but got {type(auth_info[1])}." + assert isinstance( + auth_info[0], str + ), f"Invalid type for user_name. Expected str but got {type(auth_info[0])}." + assert isinstance( + auth_info[1], str + ), f"Invalid type for password. Expected str but got {type(auth_info[1])}." self._config.PWC_USER_NAME = auth_info[0] self._config.PWC_PASSWORD = auth_info[1] else: - raise AttributeError(f"Expected tuple with length 2 but got {len(auth_info)}.") + raise AttributeError( + f"Expected tuple with length 2 but got {len(auth_info)}." + ) def _validate_params(self): """Validate user input data.""" @@ -80,66 +91,82 @@ def _validate_params(self): for item, typ in self.params_model.items(): if item in self.params.keys() and not typ == type(self.params[item]): raise TypeError( - f'Invalid type for {item}. {typ} is expected but ' - f'{type(self.params[item])} is given.') - if self.params['source'] not in self._config.VALID_API_SOURCE: + f"Invalid type for {item}. {typ} is expected but " + f"{type(self.params[item])} is given." + ) + if self.params["source"] not in self._config.VALID_API_SOURCE: raise ValueError( f"Invalid value for {self.params['source']}. " - f"Expected values are {self._config.VALID_API_SOURCE}") + f"Expected values are {self._config.VALID_API_SOURCE}" + ) def _is_valid_pagination(self, max_count=0): """Validate pagination.""" # If init_idx is greater than acutal content - if max_count == 0 or self.params['init_idx'] > max_count: + if max_count == 0 or self.params["init_idx"] > max_count: return False # Update pagination flag. 
- self.data['has_next_page'] = self.params['init_idx'] + \ - self.params['count'] < max_count + self.data["has_next_page"] = ( + self.params["init_idx"] + self.params["count"] < max_count + ) return True - + + def _unescape(self, text): + """Unescape Html Script.""" + if text: + return html.unescape(text) + return text + def _fetch_github(self) -> [Protocol]: """Fetch Github Repository""" github = Github(self._config.GITHUB_ACC_TOKEN) - query = '+'.join([self.params['query'], self._config.GITHUB_URL]) - responses = github.search_repositories(query, 'stars', 'desc') + query = "+".join([self.params["query"], self._config.GITHUB_URL]) + responses = github.search_repositories(query, "stars", "desc") results = [] if not self._is_valid_pagination(responses.totalCount): return for response in responses[ - self.params['init_idx']:min(self.params['init_idx'] + \ - self.params['count'], responses.totalCount)]: + self.params["init_idx"] : min( + self.params["init_idx"] + self.params["count"], responses.totalCount + ) + ]: data = { - 'repository_url' : response.clone_url.replace('.git', ''), - 'title' : response.name, - 'description' : response.description, - 'private' : response.private, - 'fork' : response.fork, - 'updated_at' : response.updated_at.strftime("%Y%m%dT%H:%M:%S"), - 'stargazers_count' : response.stargazers_count, - 'watchers_count' : response.watchers_count, - 'language' : response.language, - 'forks_count' : response.forks_count, - 'source' : self.params.get('source', '') + "repository_url": self._unescape( + response.clone_url.replace(".git", "") + ), + "title": self._unescape(response.name), + "description": self._unescape(response.description), + "private": self._unescape(response.private), + "fork": self._unescape(response.fork), + "updated_at": self._unescape( + response.updated_at.strftime("%Y%m%dT%H:%M:%S") + ), + "stargazers_count": self._unescape(response.stargazers_count), + "watchers_count": self._unescape(response.watchers_count), + "language": 
self._unescape(response.language), + "forks_count": self._unescape(response.forks_count), + "source": self.params.get("source", ""), } results.append(Protocol(data)) - - self.data['response_code'] = 200 - self.data['content'] = [proto.to_JSON() for proto in results] + + self.data["response_code"] = 200 + self.data["content"] = [proto.to_JSON() for proto in results] def _fetch_paperwithcode(self) -> [Protocol]: """Fetch Paper with Code Repository""" results = [] url = f"{self._config.PWC_URL}{self.params['query']}" - query_result = requests.get(url, - auth=HTTPBasicAuth(self._config.PWC_USER_NAME, - self._config.PWC_PASSWORD)) + query_result = requests.get( + url, + auth=HTTPBasicAuth(self._config.PWC_USER_NAME, self._config.PWC_PASSWORD), + ) if query_result.status_code == 200: content = json.loads(query_result.content) @@ -148,103 +175,135 @@ def _fetch_paperwithcode(self) -> [Protocol]: return content = content[ - self.params['init_idx']:min(self.params['init_idx'] + \ - self.params['count'], max_content)] + self.params["init_idx"] : min( + self.params["init_idx"] + self.params["count"], max_content + ) + ] for item in content: data = { - 'title': item.get('paper_title', None), - 'description': item.get('paper_abstract', None), - 'paper_url': item.get('paper_url', None), - 'num_of_implementations': item.get('number_of_implementations', None), - 'tasks': item.get('tasks', None), - 'paper_conference': item.get('paper_conference', None), - 'repository_url': item.get('repository_url', None), - 'repository_name': item.get('repository_name', None), - 'repository_framework': item.get('repository_framework', None), - 'repository_stars': item.get('repository_stars', None), - 'paper_published': item.get('paper_published', None), - 'pwc_url': item.get('pwc_url', ''), - 'source': self.params.get('source', '') + "title": self._unescape(item.get("paper_title", None)), + "description": self._unescape(item.get("paper_abstract", None)), + "paper_url": 
self._unescape(item.get("paper_url", None)), + "num_of_implementations": self._unescape( + item.get("number_of_implementations", None) + ), + "tasks": self._unescape(item.get("tasks", None)), + "paper_conference": self._unescape( + item.get("paper_conference", None) + ), + "repository_url": self._unescape(item.get("repository_url", None)), + "repository_name": self._unescape( + item.get("repository_name", None) + ), + "repository_framework": self._unescape( + item.get("repository_framework", None) + ), + "repository_stars": self._unescape( + item.get("repository_stars", None) + ), + "paper_published": self._unescape( + item.get("paper_published", None) + ), + "pwc_url": self._unescape(item.get("pwc_url", "")), + "source": self.params.get("source", ""), } results.append(Protocol(data)) - self.data['content'] = [proto.to_JSON() for proto in results] + self.data["content"] = [proto.to_JSON() for proto in results] + + self.data["response_code"] = query_result.status_code - self.data['response_code'] = query_result.status_code - def _fetch_youtube(self, y_next_page_token=None) -> [Protocol]: """Fetch the Youtube Repository""" results = [] youtube = googleapiclient.discovery.build( - self._config.YOUTUBE_SERVICE_NAME, - self._config.YOUTUBE_API_VERSION, - developerKey = self._config.YOUTUBE_DEVELOPER_KEY) + self._config.YOUTUBE_SERVICE_NAME, + self._config.YOUTUBE_API_VERSION, + developerKey=self._config.YOUTUBE_DEVELOPER_KEY, + ) request = youtube.search().list( part=self._config.YOUTUBE_PART, - maxResults=self.params['count'], + maxResults=self.params["count"], order=self._config.YOUTUBE_ORDER, - q=self.params['query'], + q=self.params["query"], safeSearch=self._config.YOUTUBE_SAFESEARCH, - pageToken=y_next_page_token + pageToken=y_next_page_token, ) response = request.execute() - if 'items' in response and len(response['items']) > 0: - for item in response['items']: + if "items" in response and len(response["items"]) > 0: + for item in response["items"]: data = { - 
'video_id': item.get( - 'id', dict({'videoId': None}) - ).get('videoId', None), - 'title': item.get( - 'snippet', dict({'title': None}) - ).get('title', None), - 'description': item.get( - 'snippet',dict({'description': None}) - ).get('description', None), - 'channel_id': item.get( - 'snippet',dict({'channelId': None}) - ).get('channelId', None), - 'channel_title': item.get( - 'snippet',dict({'channelTitle': None}) - ).get('channelTitle', None), - 'live_broadcast_content': item.get( - 'snippet',dict({'liveBroadcastContent': None}) - ).get('liveBroadcastContent', None), - 'published_datetime': item.get( - 'snippet',dict({'publishedAt': None}) - ).get('publishedAt', None), - 'thumbnails': item.get( - 'snippet',dict({'thumbnails': None}) - ).get('thumbnails', None), - 'source': self.params.get('source', ''), + "video_id": self._unescape( + item.get("id", dict({"videoId": None})).get("videoId", None) + ), + "title": self._unescape( + item.get("snippet", dict({"title": None})).get("title", None) + ), + "description": self._unescape( + item.get("snippet", dict({"description": None})).get( + "description", None + ) + ), + "channel_id": self._unescape( + item.get("snippet", dict({"channelId": None})).get( + "channelId", None + ) + ), + "channel_title": self._unescape( + item.get("snippet", dict({"channelTitle": None})).get( + "channelTitle", None + ) + ), + "live_broadcast_content": self._unescape( + item.get("snippet", dict({"liveBroadcastContent": None})).get( + "liveBroadcastContent", None + ) + ), + "published_datetime": self._unescape( + item.get("snippet", dict({"publishedAt": None})).get( + "publishedAt", None + ) + ), + "thumbnails": self._unescape( + item.get("snippet", dict({"thumbnails": None})).get( + "thumbnails", None + ) + ), + "source": self.params.get("source", ""), } results.append(Protocol(data)) - self.data['y_next_page_token'] = response.get('nextPageToken', None) - self.data['content'] = [proto.to_JSON() for proto in results] - 
self.data['has_next_page'] = response.get('pageInfo', dict({'totalResults':0})).get('totalResults', 0) > 0 - self.data['response_code'] = 200 + self.data["y_next_page_token"] = response.get("nextPageToken", None) + self.data["content"] = [proto.to_JSON() for proto in results] + self.data["has_next_page"] = ( + response.get("pageInfo", dict({"totalResults": 0})).get( + "totalResults", 0 + ) + > 0 + ) + self.data["response_code"] = 200 def fetch_data(self) -> json: """Fetch the data from designated API source.""" try: - if self.params.get('source', '') == 'paperwithcode': + if self.params.get("source", "") == "paperwithcode": self._fetch_paperwithcode() - if self.params.get('source', '') == 'github': + if self.params.get("source", "") == "github": self._fetch_github() - if self.params.get('source', '') == 'youtube': - self._fetch_youtube(self.params.get('y_next_page_token', None)) + if self.params.get("source", "") == "youtube": + self._fetch_youtube(self.params.get("y_next_page_token", None)) # TODO: Implement the function for Coursera. However, this function # may be handled by the backend server. - if self.params.get('source', '') == 'coursera': + if self.params.get("source", "") == "coursera": pass except Exception as ex: - self.data['response_code'] = 500 - self.data['content'] = str(ex) + self.data["response_code"] = 500 + self.data["content"] = str(ex) return self.data From 38de12c2cee8a195c392ec224957792a754d1f08 Mon Sep 17 00:00:00 2001 From: saihtaungkham <31495612+saihtaungkham@users.noreply.github.com> Date: Wed, 6 May 2020 23:54:03 +0900 Subject: [PATCH 14/23] Fix int error on html unescape. 
--- src/main/python/mlsearch/api_requester.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/python/mlsearch/api_requester.py b/src/main/python/mlsearch/api_requester.py index 860c0c8..30082d2 100644 --- a/src/main/python/mlsearch/api_requester.py +++ b/src/main/python/mlsearch/api_requester.py @@ -115,7 +115,7 @@ def _is_valid_pagination(self, max_count=0): def _unescape(self, text): """Unescape Html Script.""" - if text: + if text and isinstance(text, str): return html.unescape(text) return text From 8e1bb98858f2356845ea9cb77ca8839fc79cce65 Mon Sep 17 00:00:00 2001 From: saihtaungkham <31495612+saihtaungkham@users.noreply.github.com> Date: Thu, 7 May 2020 01:52:26 +0900 Subject: [PATCH 15/23] Fix feedback bug and add features. --- src/main/python/mlsearch/api_requester.py | 27 +++- src/main/python/mlsearch/config.py | 38 ++++-- src/main/scripts/mlsearch | 146 ++++++++++++++-------- 3 files changed, 143 insertions(+), 68 deletions(-) diff --git a/src/main/python/mlsearch/api_requester.py b/src/main/python/mlsearch/api_requester.py index 30082d2..ca44f5d 100644 --- a/src/main/python/mlsearch/api_requester.py +++ b/src/main/python/mlsearch/api_requester.py @@ -44,6 +44,15 @@ def __init__(self, source, query, init_idx, count, y_next_page_token=None): "y_next_page_token": None, } + @property + def youtube_query_order(self): + return self._config.YOUTUBE_ORDER + + @youtube_query_order.setter + def youtube_query_order(self, youtube_order): + if youtube_order: + self._config.YOUTUBE_ORDER = youtube_order + @property def github_acc_token(self): return self._config.GITHUB_ACC_TOKEN @@ -217,6 +226,12 @@ def _fetch_paperwithcode(self) -> [Protocol]: def _fetch_youtube(self, y_next_page_token=None) -> [Protocol]: """Fetch the Youtube Repository""" results = [] + input_query = str(self.params["query"]).lower().strip() + user_query = input_query + + if not self._config.YOUTUBE_FIX_KEYWORD.strip() in user_query: + user_query = input_query + 
self._config.YOUTUBE_QUERY_FILTER + youtube = googleapiclient.discovery.build( self._config.YOUTUBE_SERVICE_NAME, self._config.YOUTUBE_API_VERSION, @@ -226,7 +241,7 @@ def _fetch_youtube(self, y_next_page_token=None) -> [Protocol]: part=self._config.YOUTUBE_PART, maxResults=self.params["count"], order=self._config.YOUTUBE_ORDER, - q=self.params["query"], + q=user_query, safeSearch=self._config.YOUTUBE_SAFESEARCH, pageToken=y_next_page_token, ) @@ -234,6 +249,10 @@ def _fetch_youtube(self, y_next_page_token=None) -> [Protocol]: if "items" in response and len(response["items"]) > 0: for item in response["items"]: + # Skip if the video id is null + if not item.get("id", dict({"videoId": None})).get("videoId", None): + continue + data = { "video_id": self._unescape( item.get("id", dict({"videoId": None})).get("videoId", None) @@ -282,6 +301,7 @@ def _fetch_youtube(self, y_next_page_token=None) -> [Protocol]: ) > 0 ) + self.data["y_query_order"] = self._config.YOUTUBE_ORDER self.data["response_code"] = 200 def fetch_data(self) -> json: @@ -295,6 +315,11 @@ def fetch_data(self) -> json: self._fetch_github() if self.params.get("source", "") == "youtube": + if not self._config.YOUTUBE_ORDER in self._config.VALID_YOUTUBE_ORDER: + self.data["response_code"] = 400 + self.data["content"] = "Invalid Youtube Query Order." + return self.data + self._fetch_youtube(self.params.get("y_next_page_token", None)) # TODO: Implement the function for Coursera. 
However, this function diff --git a/src/main/python/mlsearch/config.py b/src/main/python/mlsearch/config.py index c53574f..1bd805d 100644 --- a/src/main/python/mlsearch/config.py +++ b/src/main/python/mlsearch/config.py @@ -1,24 +1,38 @@ import os + class Config(object): """Class for API Request configuration.""" # Paper with code configuration - PWC_USER_NAME = os.environ.get('PWC_USER_NAME') or '' - PWC_PASSWORD = os.environ.get('PWC_PASSWORD') or '' - PWC_URL = os.environ.get('PWC_URL') or "https://paperswithcode.com/api/v0/search/?q=" + PWC_USER_NAME = os.environ.get("PWC_USER_NAME") or "" + PWC_PASSWORD = os.environ.get("PWC_PASSWORD") or "" + PWC_URL = ( + os.environ.get("PWC_URL") or "https://paperswithcode.com/api/v0/search/?q=" + ) # Github configuration - GITHUB_ACC_TOKEN = os.environ.get('GITHUB_ACC_TOKEN') or None - GITHUB_URL = os.environ.get('GITHUB_URL') or "in:readme+in:description" + GITHUB_ACC_TOKEN = os.environ.get("GITHUB_ACC_TOKEN") or None + GITHUB_URL = os.environ.get("GITHUB_URL") or "in:readme+in:description" # AIP Source - VALID_API_SOURCE = ['paperwithcode', 'github', 'coursera', 'youtube'] + VALID_API_SOURCE = ["paperwithcode", "github", "coursera", "youtube"] # Youtube configuration - YOUTUBE_SERVICE_NAME = os.environ.get('YOUTUBE_SERVICE_NAME') or "youtube" - YOUTUBE_API_VERSION = os.environ.get('YOUTUBE_API_VERSION') or "v3" - YOUTUBE_DEVELOPER_KEY = os.environ.get('YOUTUBE_DEVELOPER_KEY') or None - YOUTUBE_ORDER = os.environ.get('YOUTUBE_ORDER') or "relevance" - YOUTUBE_SAFESEARCH = os.environ.get('YOUTUBE_SAFESEARCH') or "strict" - YOUTUBE_PART = os.environ.get('YOUTUBE_PART') or "snippet" \ No newline at end of file + YOUTUBE_SERVICE_NAME = os.environ.get("YOUTUBE_SERVICE_NAME") or "youtube" + YOUTUBE_API_VERSION = os.environ.get("YOUTUBE_API_VERSION") or "v3" + YOUTUBE_DEVELOPER_KEY = os.environ.get("YOUTUBE_DEVELOPER_KEY") or None + YOUTUBE_ORDER = os.environ.get("YOUTUBE_ORDER") or "relevance" + YOUTUBE_SAFESEARCH = 
os.environ.get("YOUTUBE_SAFESEARCH") or "strict" + YOUTUBE_PART = os.environ.get("YOUTUBE_PART") or "snippet" + YOUTUBE_FIX_KEYWORD = "machine learning" + YOUTUBE_QUERY_FILTER = " " + YOUTUBE_FIX_KEYWORD + " -news" + VALID_YOUTUBE_ORDER = [ + "date", + "rating", + "relevance", + "title", + # "videoCount", # This is for channel only + "viewCount", + ] + diff --git a/src/main/scripts/mlsearch b/src/main/scripts/mlsearch index 0d1b98e..878f055 100644 --- a/src/main/scripts/mlsearch +++ b/src/main/scripts/mlsearch @@ -6,67 +6,101 @@ import os import json # For debugging purpose -if 'mlsearch' not in sys.modules: - sys.path.append(os.path.join(os.getcwd(), 'src/main/python')) +if "mlsearch" not in sys.modules: + sys.path.append(os.path.join(os.getcwd(), "src/main/python")) from mlsearch.api_requester import APIRequest from mlsearch import helper as hp ap = argparse.ArgumentParser() -ap.add_argument('-q', '--query', required=True, help="Keyword for searching.") -ap.add_argument('-i', '--init_idx', required=True, help="Initial index for pagination.") -ap.add_argument('-c', '--count', required=True, help="Total number of results to be fetched.") -ap.add_argument('-s', '--source', required=True, help="Source API to be looking for.") -ap.add_argument('-ck', '--cookies', required=True, help="Cookies of current user.") -ap.add_argument('-tm', '--timestamp', required=True, help="Timestamp of requesting API.") -ap.add_argument('-pu', '--pwc_user', required=False, help="Paper with code repository user name.") -ap.add_argument('-pp', '--pwc_password', required=False, help="Paper with code repository password.") -ap.add_argument('-gt', '--github_acc_token', required=False, help="Github access token.") -ap.add_argument('-yk', '--youtube_dev_key', required=False, help="Youtube developer key.") -ap.add_argument('-yntp', '--y_next_page_token', required=False, help="Next page token for Youtube API.") +ap.add_argument("-q", "--query", required=True, help="Keyword for searching.") 
+ap.add_argument("-i", "--init_idx", required=True, help="Initial index for pagination.") +ap.add_argument( + "-c", "--count", required=True, help="Total number of results to be fetched." +) +ap.add_argument("-s", "--source", required=True, help="Source API to be looking for.") +ap.add_argument("-ck", "--cookies", required=True, help="Cookies of current user.") +ap.add_argument( + "-tm", "--timestamp", required=True, help="Timestamp of requesting API." +) +ap.add_argument( + "-pu", "--pwc_user", required=False, help="Paper with code repository user name." +) +ap.add_argument( + "-pp", "--pwc_password", required=False, help="Paper with code repository password." +) +ap.add_argument( + "-gt", "--github_acc_token", required=False, help="Github access token." +) +ap.add_argument( + "-yk", "--youtube_dev_key", required=False, help="Youtube developer key." +) +ap.add_argument( + "-yntp", + "--y_next_page_token", + required=False, + help="Next page token for Youtube API.", +) +ap.add_argument( + "-yo", "--youtube_query_order", required=False, help="Youtube Query Order." 
+) args = vars(ap.parse_args()) + def main(event): headers = { - 'Access-Control-Allow-Origin': '*', - 'X-Requested-With': '*', - 'Access-Control-Allow-Headers': 'Content-Type,X-Amz-Date,Authorization,X-Api-Key,x-requested-with', - 'Access-Control-Allow-Methods': 'OPTIONS,POST,GET' - } + "Access-Control-Allow-Origin": "*", + "X-Requested-With": "*", + "Access-Control-Allow-Headers": "Content-Type,X-Amz-Date,Authorization,X-Api-Key,x-requested-with", + "Access-Control-Allow-Methods": "OPTIONS,POST,GET", + } try: param_names = [ - 'query', 'init_idx', - 'count', 'source', - 'cookies', 'timestamp', - 'y_next_page_token'] - response_msg = hp.response('success', 200) + "query", + "init_idx", + "count", + "source", + "cookies", + "timestamp", + "y_next_page_token", + ] + response_msg = hp.response("success", 200) if hp.is_valid_parameters(event, param_names): params = hp.parse_parameters(event) if params.values(): api_request = APIRequest( - params['source'], - params['query'], - params['init_idx'], - params['count'], - params['y_next_page_token']) - if 'pwc_user'in event and 'pwc_password' in event: - api_request.pwc_auth_info = (event['pwc_user'], event['pwc_password']) - if 'github_acc_token' in event: - api_request.github_acc_token = event['github_acc_token'] - if 'youtube_developer_key' in event: - api_request.youtube_developer_key = event['youtube_developer_key'] + params["source"], + params["query"], + params["init_idx"], + params["count"], + params["y_next_page_token"], + ) + if "pwc_user" in event and "pwc_password" in event: + api_request.pwc_auth_info = ( + event["pwc_user"], + event["pwc_password"], + ) + if "github_acc_token" in event: + api_request.github_acc_token = event["github_acc_token"] + if "youtube_developer_key" in event: + api_request.youtube_developer_key = event["youtube_developer_key"] + if "youtube_query_order" in event: + api_request.youtube_query_order = event["youtube_query_order"] data = api_request.fetch_data() response_msg = hp.response( 
- message=data.get('content',''), - status_code=data.get('response_code'), + message=data.get("content", ""), + status_code=data.get("response_code"), headers=headers, optional_attributes={ - 'has_next_page': data.get('has_next_page', False), - 'y_next_page_token': data.get('y_next_page_token', None)}) + "has_next_page": data.get("has_next_page", False), + "y_next_page_token": data.get("y_next_page_token", None), + "y_query_order": data.get("y_query_order", None) + }, + ) return response_msg - response_msg = hp.response('Invalid parameters.', 400) + response_msg = hp.response("Invalid parameters.", 400) return response_msg except (ValueError, TypeError) as ex: @@ -77,26 +111,28 @@ def main(event): response_msg = hp.response(str(ex), 500) return response_msg + if __name__ == "__main__": event = { - 'query': args['query'], - 'init_idx': args['init_idx'], - 'count': args['count'], - 'source': args['source'], - 'cookies': args['cookies'], - 'timestamp': args['timestamp'], - 'y_next_page_token': args['y_next_page_token'] + "query": args["query"], + "init_idx": args["init_idx"], + "count": args["count"], + "source": args["source"], + "cookies": args["cookies"], + "timestamp": args["timestamp"], + "y_next_page_token": args["y_next_page_token"], } - if args['pwc_user']: - event['pwc_user'] = args['pwc_user'] - if args['pwc_password']: - event['pwc_password'] = args['pwc_password'] - if args['github_acc_token']: - event['github_acc_token'] = args['github_acc_token'] - if args['youtube_dev_key']: - event['youtube_developer_key'] = args['youtube_dev_key'] - + if args["pwc_user"]: + event["pwc_user"] = args["pwc_user"] + if args["pwc_password"]: + event["pwc_password"] = args["pwc_password"] + if args["github_acc_token"]: + event["github_acc_token"] = args["github_acc_token"] + if args["youtube_dev_key"]: + event["youtube_developer_key"] = args["youtube_dev_key"] + if args["youtube_query_order"]: + event["youtube_query_order"] = args["youtube_query_order"] result = main(event) 
pp = pprint.PrettyPrinter(indent=2) pp.pprint(result) From 46bf6f6e39f5dbfd62e8bec4e30949bcf24f3a4f Mon Sep 17 00:00:00 2001 From: saihtaungkham <31495612+saihtaungkham@users.noreply.github.com> Date: Thu, 7 May 2020 01:59:50 +0900 Subject: [PATCH 16/23] Fix ambiguous youtube query order keyword. --- src/main/scripts/mlsearch | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/main/scripts/mlsearch b/src/main/scripts/mlsearch index 878f055..527b615 100644 --- a/src/main/scripts/mlsearch +++ b/src/main/scripts/mlsearch @@ -42,7 +42,7 @@ ap.add_argument( help="Next page token for Youtube API.", ) ap.add_argument( - "-yo", "--youtube_query_order", required=False, help="Youtube Query Order." + "-yo", "--y_query_order", required=False, help="Youtube Query Order." ) args = vars(ap.parse_args()) @@ -84,8 +84,8 @@ def main(event): api_request.github_acc_token = event["github_acc_token"] if "youtube_developer_key" in event: api_request.youtube_developer_key = event["youtube_developer_key"] - if "youtube_query_order" in event: - api_request.youtube_query_order = event["youtube_query_order"] + if "y_query_order" in event: + api_request.youtube_query_order = event["y_query_order"] data = api_request.fetch_data() response_msg = hp.response( message=data.get("content", ""), @@ -131,8 +131,8 @@ if __name__ == "__main__": event["github_acc_token"] = args["github_acc_token"] if args["youtube_dev_key"]: event["youtube_developer_key"] = args["youtube_dev_key"] - if args["youtube_query_order"]: - event["youtube_query_order"] = args["youtube_query_order"] + if args["y_query_order"]: + event["y_query_order"] = args["y_query_order"] result = main(event) pp = pprint.PrettyPrinter(indent=2) pp.pprint(result) From 9abf08fda1d98ca010b159548e6ff6b31a74c04e Mon Sep 17 00:00:00 2001 From: saihtaungkham <31495612+saihtaungkham@users.noreply.github.com> Date: Thu, 7 May 2020 02:19:02 +0900 Subject: [PATCH 17/23] Add usage to README file. 
--- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index c425d65..da9ec19 100644 --- a/README.md +++ b/README.md @@ -42,6 +42,8 @@ Optional Parameters: Youtube developer key. -ynpt NEXT_PAGE_TOKEN, --y_next_page_token NEXT_PAGE_TOKEN Next page token for Youtube API. + -yo Y_QUERY_ORDER, --y_query_order Y_QUERY_ORDER + Youtube Query Order. ```
From 569a8dc855857901be513b75ff32c19898e168ce Mon Sep 17 00:00:00 2001 From: saihtaungkham <31495612+saihtaungkham@users.noreply.github.com> Date: Sun, 17 May 2020 03:39:33 +0900 Subject: [PATCH 18/23] Fix YouTube response result and Exceptions. --- src/main/python/mlsearch/api_requester.py | 65 +++++++++++++++++------ src/main/python/mlsearch/config.py | 10 +++- src/main/scripts/mlsearch | 10 ++-- 3 files changed, 63 insertions(+), 22 deletions(-) diff --git a/src/main/python/mlsearch/api_requester.py b/src/main/python/mlsearch/api_requester.py index ca44f5d..9becbe0 100644 --- a/src/main/python/mlsearch/api_requester.py +++ b/src/main/python/mlsearch/api_requester.py @@ -2,10 +2,14 @@ from mlsearch.protocol import Protocol from github import Github from requests.auth import HTTPBasicAuth +from github.GithubException import BadCredentialsException +from github.GithubException import RateLimitExceededException +from googleapiclient.errors import HttpError import googleapiclient.discovery import json import requests import html +import random # import scholarly @@ -68,8 +72,12 @@ def youtube_developer_key(self): @youtube_developer_key.setter def youtube_developer_key(self, developer_key): - if developer_key: + if isinstance(developer_key, list): self._config.YOUTUBE_DEVELOPER_KEY = developer_key + elif isinstance(developer_key, str) and "," in developer_key: + self._config.YOUTUBE_DEVELOPER_KEY = developer_key.strip().split(",") + elif developer_key and isinstance(developer_key, str): + self._config.YOUTUBE_DEVELOPER_KEY.append(developer_key) @property def pwc_auth_info(self): @@ -220,8 +228,13 @@ def _fetch_paperwithcode(self) -> [Protocol]: results.append(Protocol(data)) self.data["content"] = [proto.to_JSON() for proto in results] - - self.data["response_code"] = query_result.status_code + else: + print(str(query_result.status_code), query_result.content) + self.data["response_code"] = query_result.status_code + self.data["content"] = ( + "There is an error in 
fetching data from PWC server." + f" {json.loads(query_result.content).get('error')}" + ) def _fetch_youtube(self, y_next_page_token=None) -> [Protocol]: """Fetch the Youtube Repository""" @@ -232,18 +245,23 @@ def _fetch_youtube(self, y_next_page_token=None) -> [Protocol]: if not self._config.YOUTUBE_FIX_KEYWORD.strip() in user_query: user_query = input_query + self._config.YOUTUBE_QUERY_FILTER + sampled_dev_key = None + if len(self._config.YOUTUBE_DEVELOPER_KEY) > 0: + sampled_dev_key = random.choice(self._config.YOUTUBE_DEVELOPER_KEY) youtube = googleapiclient.discovery.build( self._config.YOUTUBE_SERVICE_NAME, self._config.YOUTUBE_API_VERSION, - developerKey=self._config.YOUTUBE_DEVELOPER_KEY, + developerKey=sampled_dev_key, ) + request = youtube.search().list( part=self._config.YOUTUBE_PART, maxResults=self.params["count"], order=self._config.YOUTUBE_ORDER, q=user_query, safeSearch=self._config.YOUTUBE_SAFESEARCH, - pageToken=y_next_page_token, + # Disabled the next page token due to limitation of api access. 
+ # pageToken=y_next_page_token, ) response = request.execute() @@ -293,14 +311,15 @@ def _fetch_youtube(self, y_next_page_token=None) -> [Protocol]: "source": self.params.get("source", ""), } results.append(Protocol(data)) - self.data["y_next_page_token"] = response.get("nextPageToken", None) + # self.data["y_next_page_token"] = response.get("nextPageToken", None) self.data["content"] = [proto.to_JSON() for proto in results] - self.data["has_next_page"] = ( - response.get("pageInfo", dict({"totalResults": 0})).get( - "totalResults", 0 - ) - > 0 - ) + # self.data["has_next_page"] = ( + # response.get("pageInfo", dict({"totalResults": 0})).get( + # "totalResults", 0 + # ) + # > 0 + # ) + self.data["has_next_page"] = False self.data["y_query_order"] = self._config.YOUTUBE_ORDER self.data["response_code"] = 200 @@ -312,15 +331,28 @@ def fetch_data(self) -> json: self._fetch_paperwithcode() if self.params.get("source", "") == "github": - self._fetch_github() + try: + self._fetch_github() + except BadCredentialsException: + self.data["response_code"] = 400 + self.data["content"] = "Invalid Github developer key." + except RateLimitExceededException: + self.data["response_code"] = 503 + self.data["content"] = "Access rate limitation reached." if self.params.get("source", "") == "youtube": if not self._config.YOUTUBE_ORDER in self._config.VALID_YOUTUBE_ORDER: self.data["response_code"] = 400 self.data["content"] = "Invalid Youtube Query Order." return self.data - - self._fetch_youtube(self.params.get("y_next_page_token", None)) + try: + self._fetch_youtube(self.params.get("y_next_page_token", None)) + except HttpError as ex: + print(str(ex)) + self.data["response_code"] = 400 + self.data[ + "content" + ] = "Seems there is an authentication error with Youtube server." # TODO: Implement the function for Coursera. However, this function # may be handled by the backend server. 
@@ -328,7 +360,8 @@ def fetch_data(self) -> json: pass except Exception as ex: + print(str(ex)) + self.data["content"] = "Oops... Something has gone wrong in server." self.data["response_code"] = 500 - self.data["content"] = str(ex) return self.data diff --git a/src/main/python/mlsearch/config.py b/src/main/python/mlsearch/config.py index 1bd805d..729e4ec 100644 --- a/src/main/python/mlsearch/config.py +++ b/src/main/python/mlsearch/config.py @@ -21,7 +21,15 @@ class Config(object): # Youtube configuration YOUTUBE_SERVICE_NAME = os.environ.get("YOUTUBE_SERVICE_NAME") or "youtube" YOUTUBE_API_VERSION = os.environ.get("YOUTUBE_API_VERSION") or "v3" - YOUTUBE_DEVELOPER_KEY = os.environ.get("YOUTUBE_DEVELOPER_KEY") or None + # Parsing Youtube Keys + YOUTUBE_DEVELOPER_KEY = list() + developer_key = os.environ.get("YOUTUBE_DEVELOPER_KEY") + if isinstance(developer_key, list): + YOUTUBE_DEVELOPER_KEY = developer_key + elif isinstance(developer_key, str) and "," in developer_key: + YOUTUBE_DEVELOPER_KEY = developer_key.strip().split(",") + elif developer_key and isinstance(developer_key, str): + YOUTUBE_DEVELOPER_KEY.append(developer_key) YOUTUBE_ORDER = os.environ.get("YOUTUBE_ORDER") or "relevance" YOUTUBE_SAFESEARCH = os.environ.get("YOUTUBE_SAFESEARCH") or "strict" YOUTUBE_PART = os.environ.get("YOUTUBE_PART") or "snippet" diff --git a/src/main/scripts/mlsearch b/src/main/scripts/mlsearch index 527b615..7ae2448 100644 --- a/src/main/scripts/mlsearch +++ b/src/main/scripts/mlsearch @@ -33,7 +33,7 @@ ap.add_argument( "-gt", "--github_acc_token", required=False, help="Github access token." ) ap.add_argument( - "-yk", "--youtube_dev_key", required=False, help="Youtube developer key." + "-yk", "--y_dev_key", required=False, help="Youtube developer key." 
) ap.add_argument( "-yntp", @@ -82,8 +82,8 @@ def main(event): ) if "github_acc_token" in event: api_request.github_acc_token = event["github_acc_token"] - if "youtube_developer_key" in event: - api_request.youtube_developer_key = event["youtube_developer_key"] + if "y_dev_key" in event: + api_request.youtube_developer_key = event["y_dev_key"] if "y_query_order" in event: api_request.youtube_query_order = event["y_query_order"] data = api_request.fetch_data() @@ -129,8 +129,8 @@ if __name__ == "__main__": event["pwc_password"] = args["pwc_password"] if args["github_acc_token"]: event["github_acc_token"] = args["github_acc_token"] - if args["youtube_dev_key"]: - event["youtube_developer_key"] = args["youtube_dev_key"] + if args["y_dev_key"]: + event["y_dev_key"] = args["y_dev_key"] if args["y_query_order"]: event["y_query_order"] = args["y_query_order"] result = main(event) From 9c52cf512b65d908db1097485b405a8c0e375e56 Mon Sep 17 00:00:00 2001 From: saihtaungkham <31495612+saihtaungkham@users.noreply.github.com> Date: Sun, 24 May 2020 15:16:27 +0900 Subject: [PATCH 19/23] Check Auth Error on YouTube before happening. 
--- src/main/python/mlsearch/api_requester.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/main/python/mlsearch/api_requester.py b/src/main/python/mlsearch/api_requester.py index 9becbe0..f07413d 100644 --- a/src/main/python/mlsearch/api_requester.py +++ b/src/main/python/mlsearch/api_requester.py @@ -10,9 +10,11 @@ import requests import html import random +import collections # import scholarly +ErrorType = collections.namedtuple("ErrorType", "reason status") class APIRequest: """For handling the different Valid API requests.""" @@ -248,6 +250,13 @@ def _fetch_youtube(self, y_next_page_token=None) -> [Protocol]: sampled_dev_key = None if len(self._config.YOUTUBE_DEVELOPER_KEY) > 0: sampled_dev_key = random.choice(self._config.YOUTUBE_DEVELOPER_KEY) + + if not sampled_dev_key: + auth_error = ErrorType( + reason="Empty YouTube Developer Key.", status="400" + ) + raise HttpError(auth_error, str.encode("YouTube Developer Key Required.")) + youtube = googleapiclient.discovery.build( self._config.YOUTUBE_SERVICE_NAME, self._config.YOUTUBE_API_VERSION, From 82bb9a1a5dc8accb467fcfff8f6574064cc351d1 Mon Sep 17 00:00:00 2001 From: saihtaungkham <31495612+saihtaungkham@users.noreply.github.com> Date: Sun, 24 May 2020 22:51:57 +0900 Subject: [PATCH 20/23] Improve Github Pagination Logic. 
--- src/main/python/mlsearch/api_requester.py | 30 ++++++++++++++++------- src/main/python/mlsearch/config.py | 2 +- 2 files changed, 22 insertions(+), 10 deletions(-) diff --git a/src/main/python/mlsearch/api_requester.py b/src/main/python/mlsearch/api_requester.py index f07413d..fe973aa 100644 --- a/src/main/python/mlsearch/api_requester.py +++ b/src/main/python/mlsearch/api_requester.py @@ -11,11 +11,13 @@ import html import random import collections +import math # import scholarly ErrorType = collections.namedtuple("ErrorType", "reason status") + class APIRequest: """For handling the different Valid API requests.""" @@ -140,8 +142,13 @@ def _unescape(self, text): def _fetch_github(self) -> [Protocol]: """Fetch Github Repository""" + per_page = self._config.GITHUB_PER_PAGE + github = Github(self._config.GITHUB_ACC_TOKEN, per_page=per_page) - github = Github(self._config.GITHUB_ACC_TOKEN) + skip_page = math.floor(self.params["init_idx"] / per_page) + total_page = math.ceil( + (self.params["init_idx"] + self.params["count"]) / per_page + ) query = "+".join([self.params["query"], self._config.GITHUB_URL]) responses = github.search_repositories(query, "stars", "desc") results = [] @@ -149,12 +156,19 @@ def _fetch_github(self) -> [Protocol]: if not self._is_valid_pagination(responses.totalCount): return - for response in responses[ - self.params["init_idx"] : min( - self.params["init_idx"] + self.params["count"], responses.totalCount - ) - ]: + paginated_responses = list() + for i in range(skip_page + 1, total_page + 1): + paginated_responses.extend(responses.get_page(i)) + first_slot_items = per_page - (self.params["init_idx"] % per_page) + end_slot_items = per_page - ( + (total_page * per_page) - (self.params["count"] + self.params["init_idx"]) + ) + + start_idx = per_page - first_slot_items + end_idx = (len(paginated_responses) - per_page) + end_slot_items + + for response in paginated_responses[start_idx:end_idx]: data = { "repository_url": self._unescape( 
response.clone_url.replace(".git", "") @@ -252,9 +266,7 @@ def _fetch_youtube(self, y_next_page_token=None) -> [Protocol]: sampled_dev_key = random.choice(self._config.YOUTUBE_DEVELOPER_KEY) if not sampled_dev_key: - auth_error = ErrorType( - reason="Empty YouTube Developer Key.", status="400" - ) + auth_error = ErrorType(reason="Empty YouTube Developer Key.", status="400") raise HttpError(auth_error, str.encode("YouTube Developer Key Required.")) youtube = googleapiclient.discovery.build( diff --git a/src/main/python/mlsearch/config.py b/src/main/python/mlsearch/config.py index 729e4ec..ec2e5c4 100644 --- a/src/main/python/mlsearch/config.py +++ b/src/main/python/mlsearch/config.py @@ -14,7 +14,7 @@ class Config(object): # Github configuration GITHUB_ACC_TOKEN = os.environ.get("GITHUB_ACC_TOKEN") or None GITHUB_URL = os.environ.get("GITHUB_URL") or "in:readme+in:description" - + GITHUB_PER_PAGE = os.environ.get("PER_PAGE") or 10 # AIP Source VALID_API_SOURCE = ["paperwithcode", "github", "coursera", "youtube"] From e0270f49da2dab6b54adac94b63c0f952046d919 Mon Sep 17 00:00:00 2001 From: saihtaungkham <31495612+saihtaungkham@users.noreply.github.com> Date: Mon, 25 May 2020 10:47:11 +0900 Subject: [PATCH 21/23] Fix YouTube response result and Exceptions. 
--- src/main/python/mlsearch/api_requester.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/main/python/mlsearch/api_requester.py b/src/main/python/mlsearch/api_requester.py index f07413d..a1a9a8b 100644 --- a/src/main/python/mlsearch/api_requester.py +++ b/src/main/python/mlsearch/api_requester.py @@ -248,15 +248,14 @@ def _fetch_youtube(self, y_next_page_token=None) -> [Protocol]: user_query = input_query + self._config.YOUTUBE_QUERY_FILTER sampled_dev_key = None - if len(self._config.YOUTUBE_DEVELOPER_KEY) > 0: - sampled_dev_key = random.choice(self._config.YOUTUBE_DEVELOPER_KEY) - - if not sampled_dev_key: + if not len(self._config.YOUTUBE_DEVELOPER_KEY) > 0: auth_error = ErrorType( reason="Empty YouTube Developer Key.", status="400" ) raise HttpError(auth_error, str.encode("YouTube Developer Key Required.")) + sampled_dev_key = random.choice(self._config.YOUTUBE_DEVELOPER_KEY) + youtube = googleapiclient.discovery.build( self._config.YOUTUBE_SERVICE_NAME, self._config.YOUTUBE_API_VERSION, From 0652bd69062056e1eee860ba3ba399159feabe8f Mon Sep 17 00:00:00 2001 From: saihtaungkham <31495612+saihtaungkham@users.noreply.github.com> Date: Mon, 25 May 2020 10:47:11 +0900 Subject: [PATCH 22/23] Fix YouTube response result and Exceptions. Fix YouTube response result and Exceptions. 
--- src/main/python/mlsearch/api_requester.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/main/python/mlsearch/api_requester.py b/src/main/python/mlsearch/api_requester.py index fe973aa..f97ce9d 100644 --- a/src/main/python/mlsearch/api_requester.py +++ b/src/main/python/mlsearch/api_requester.py @@ -262,13 +262,14 @@ def _fetch_youtube(self, y_next_page_token=None) -> [Protocol]: user_query = input_query + self._config.YOUTUBE_QUERY_FILTER sampled_dev_key = None - if len(self._config.YOUTUBE_DEVELOPER_KEY) > 0: - sampled_dev_key = random.choice(self._config.YOUTUBE_DEVELOPER_KEY) - - if not sampled_dev_key: - auth_error = ErrorType(reason="Empty YouTube Developer Key.", status="400") + if not len(self._config.YOUTUBE_DEVELOPER_KEY) > 0: + auth_error = ErrorType( + reason="Empty YouTube Developer Key.", status="400" + ) raise HttpError(auth_error, str.encode("YouTube Developer Key Required.")) + sampled_dev_key = random.choice(self._config.YOUTUBE_DEVELOPER_KEY) + youtube = googleapiclient.discovery.build( self._config.YOUTUBE_SERVICE_NAME, self._config.YOUTUBE_API_VERSION, From e68a8d9477b8661f9a018571387adcec9ffe3479 Mon Sep 17 00:00:00 2001 From: saihtaungkham <31495612+saihtaungkham@users.noreply.github.com> Date: Sun, 31 May 2020 23:00:13 +0900 Subject: [PATCH 23/23] Fix variable name and code formatting. 
--- src/main/python/mlsearch/api_requester.py | 72 +++++++++++++++-------- src/main/python/mlsearch/config.py | 6 +- 2 files changed, 52 insertions(+), 26 deletions(-) diff --git a/src/main/python/mlsearch/api_requester.py b/src/main/python/mlsearch/api_requester.py index f97ce9d..cdb6981 100644 --- a/src/main/python/mlsearch/api_requester.py +++ b/src/main/python/mlsearch/api_requester.py @@ -79,7 +79,9 @@ def youtube_developer_key(self, developer_key): if isinstance(developer_key, list): self._config.YOUTUBE_DEVELOPER_KEY = developer_key elif isinstance(developer_key, str) and "," in developer_key: - self._config.YOUTUBE_DEVELOPER_KEY = developer_key.strip().split(",") + self._config.YOUTUBE_DEVELOPER_KEY = developer_key.strip().split( + "," + ) elif developer_key and isinstance(developer_key, str): self._config.YOUTUBE_DEVELOPER_KEY.append(developer_key) @@ -110,7 +112,9 @@ def _validate_params(self): """Validate user input data.""" for item, typ in self.params_model.items(): - if item in self.params.keys() and not typ == type(self.params[item]): + if item in self.params.keys() and not typ == type( + self.params[item] + ): raise TypeError( f"Invalid type for {item}. {typ} is expected but " f"{type(self.params[item])} is given." 
@@ -142,12 +146,12 @@ def _unescape(self, text): def _fetch_github(self) -> [Protocol]: """Fetch Github Repository""" - per_page = self._config.GITHUB_PER_PAGE - github = Github(self._config.GITHUB_ACC_TOKEN, per_page=per_page) + item_per_page = self._config.GITHUB_PER_PAGE + github = Github(self._config.GITHUB_ACC_TOKEN, per_page=item_per_page) - skip_page = math.floor(self.params["init_idx"] / per_page) + skip_page = math.floor(self.params["init_idx"] / item_per_page) total_page = math.ceil( - (self.params["init_idx"] + self.params["count"]) / per_page + (self.params["init_idx"] + self.params["count"]) / item_per_page ) query = "+".join([self.params["query"], self._config.GITHUB_URL]) responses = github.search_repositories(query, "stars", "desc") @@ -160,13 +164,16 @@ def _fetch_github(self) -> [Protocol]: for i in range(skip_page + 1, total_page + 1): paginated_responses.extend(responses.get_page(i)) - first_slot_items = per_page - (self.params["init_idx"] % per_page) - end_slot_items = per_page - ( - (total_page * per_page) - (self.params["count"] + self.params["init_idx"]) + first_slot_items = item_per_page - ( + self.params["init_idx"] % item_per_page + ) + end_slot_items = item_per_page - ( + (total_page * item_per_page) + - (self.params["count"] + self.params["init_idx"]) ) - start_idx = per_page - first_slot_items - end_idx = (len(paginated_responses) - per_page) + end_slot_items + start_idx = item_per_page - first_slot_items + end_idx = (len(paginated_responses) - item_per_page) + end_slot_items for response in paginated_responses[start_idx:end_idx]: data = { @@ -198,7 +205,9 @@ def _fetch_paperwithcode(self) -> [Protocol]: url = f"{self._config.PWC_URL}{self.params['query']}" query_result = requests.get( url, - auth=HTTPBasicAuth(self._config.PWC_USER_NAME, self._config.PWC_PASSWORD), + auth=HTTPBasicAuth( + self._config.PWC_USER_NAME, self._config.PWC_PASSWORD + ), ) if query_result.status_code == 200: @@ -216,7 +225,9 @@ def _fetch_paperwithcode(self) 
-> [Protocol]: for item in content: data = { "title": self._unescape(item.get("paper_title", None)), - "description": self._unescape(item.get("paper_abstract", None)), + "description": self._unescape( + item.get("paper_abstract", None) + ), "paper_url": self._unescape(item.get("paper_url", None)), "num_of_implementations": self._unescape( item.get("number_of_implementations", None) @@ -225,7 +236,9 @@ def _fetch_paperwithcode(self) -> [Protocol]: "paper_conference": self._unescape( item.get("paper_conference", None) ), - "repository_url": self._unescape(item.get("repository_url", None)), + "repository_url": self._unescape( + item.get("repository_url", None) + ), "repository_name": self._unescape( item.get("repository_name", None) ), @@ -266,7 +279,9 @@ def _fetch_youtube(self, y_next_page_token=None) -> [Protocol]: auth_error = ErrorType( reason="Empty YouTube Developer Key.", status="400" ) - raise HttpError(auth_error, str.encode("YouTube Developer Key Required.")) + raise HttpError( + auth_error, str.encode("YouTube Developer Key Required.") + ) sampled_dev_key = random.choice(self._config.YOUTUBE_DEVELOPER_KEY) @@ -290,15 +305,21 @@ def _fetch_youtube(self, y_next_page_token=None) -> [Protocol]: if "items" in response and len(response["items"]) > 0: for item in response["items"]: # Skip if the video id is null - if not item.get("id", dict({"videoId": None})).get("videoId", None): + if not item.get("id", dict({"videoId": None})).get( + "videoId", None + ): continue data = { "video_id": self._unescape( - item.get("id", dict({"videoId": None})).get("videoId", None) + item.get("id", dict({"videoId": None})).get( + "videoId", None + ) ), "title": self._unescape( - item.get("snippet", dict({"title": None})).get("title", None) + item.get("snippet", dict({"title": None})).get( + "title", None + ) ), "description": self._unescape( item.get("snippet", dict({"description": None})).get( @@ -316,9 +337,9 @@ def _fetch_youtube(self, y_next_page_token=None) -> [Protocol]: ) 
), "live_broadcast_content": self._unescape( - item.get("snippet", dict({"liveBroadcastContent": None})).get( - "liveBroadcastContent", None - ) + item.get( + "snippet", dict({"liveBroadcastContent": None}) + ).get("liveBroadcastContent", None) ), "published_datetime": self._unescape( item.get("snippet", dict({"publishedAt": None})).get( @@ -363,12 +384,17 @@ def fetch_data(self) -> json: self.data["content"] = "Access rate limitation reached." if self.params.get("source", "") == "youtube": - if not self._config.YOUTUBE_ORDER in self._config.VALID_YOUTUBE_ORDER: + if ( + not self._config.YOUTUBE_ORDER + in self._config.VALID_YOUTUBE_ORDER + ): self.data["response_code"] = 400 self.data["content"] = "Invalid Youtube Query Order." return self.data try: - self._fetch_youtube(self.params.get("y_next_page_token", None)) + self._fetch_youtube( + self.params.get("y_next_page_token", None) + ) except HttpError as ex: print(str(ex)) self.data["response_code"] = 400 diff --git a/src/main/python/mlsearch/config.py b/src/main/python/mlsearch/config.py index ec2e5c4..48e9b66 100644 --- a/src/main/python/mlsearch/config.py +++ b/src/main/python/mlsearch/config.py @@ -8,13 +8,14 @@ class Config(object): PWC_USER_NAME = os.environ.get("PWC_USER_NAME") or "" PWC_PASSWORD = os.environ.get("PWC_PASSWORD") or "" PWC_URL = ( - os.environ.get("PWC_URL") or "https://paperswithcode.com/api/v0/search/?q=" + os.environ.get("PWC_URL") + or "https://paperswithcode.com/api/v0/search/?q=" ) # Github configuration GITHUB_ACC_TOKEN = os.environ.get("GITHUB_ACC_TOKEN") or None GITHUB_URL = os.environ.get("GITHUB_URL") or "in:readme+in:description" - GITHUB_PER_PAGE = os.environ.get("PER_PAGE") or 10 + GITHUB_PER_PAGE = os.environ.get("ITEM_PER_PAGE") or 10 # AIP Source VALID_API_SOURCE = ["paperwithcode", "github", "coursera", "youtube"] @@ -43,4 +44,3 @@ class Config(object): # "videoCount", # This is for channel only "viewCount", ] -