From b7b23295d5035c008d8deb815eac2ecfee61524e Mon Sep 17 00:00:00 2001 From: Matthias Jacob Date: Thu, 21 Jan 2021 00:53:00 +0100 Subject: [PATCH 1/6] fix revision references --- test_changes.sh | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/test_changes.sh b/test_changes.sh index 3fda8b9..d3a62da 100755 --- a/test_changes.sh +++ b/test_changes.sh @@ -2,7 +2,16 @@ set -e -CHANGED_SCRAPERS=$(git whatchanged --name-only --pretty="" origin..HEAD | +if [ $CI ] +then + HEAD_REF=${GITHUB_REF} +else + HEAD_REF="HEAD" +fi + +echo "Using head reference: ${HEAD_REF}" + +CHANGED_SCRAPERS=$(git whatchanged --name-only --pretty="" origin/master..${HEAD_REF} | grep spiders | grep -v helper | sed 's/jedeschule\/spiders\///' | From 2ff9901620fa27b0c469a95a8aa7fdf4398d5e46 Mon Sep 17 00:00:00 2001 From: Matthias Jacob Date: Thu, 21 Jan 2021 01:51:04 +0100 Subject: [PATCH 2/6] fetch all git history in order to be able to compare with master --- .github/workflows/test_scraper_changes.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/test_scraper_changes.yml b/.github/workflows/test_scraper_changes.yml index 1206ac5..fc4eacf 100644 --- a/.github/workflows/test_scraper_changes.yml +++ b/.github/workflows/test_scraper_changes.yml @@ -8,6 +8,8 @@ jobs: name: Show changes in scraper results steps: - uses: actions/checkout@v2 + with: + fetch-depth: 0 - name: Setup python uses: actions/setup-python@v1 with: From e6e879183a3e80b86d04f4f85b0448efcd247d55 Mon Sep 17 00:00:00 2001 From: Matthias Jacob Date: Thu, 21 Jan 2021 02:07:58 +0100 Subject: [PATCH 3/6] upgrade scrapy --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 9b9cda6..322ede4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ alembic==1.3.3 -Scrapy==1.7.3 +Scrapy==2.4.1 requests==2.20.0 wget==3.2 xlrd==1.1.0 From 67911825907c47c76bcd17685d7a9bc6af426508 Mon Sep 17 00:00:00 2001 From: Matthias Jacob Date: Thu, 21 Jan 2021 02:26:04 +0100 Subject: [PATCH 4/6] lazy session initialization --- jedeschule/pipelines/db_pipeline.py | 14 +++++++++++--- test_models.py | 4 +++- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/jedeschule/pipelines/db_pipeline.py b/jedeschule/pipelines/db_pipeline.py index fe6939f..0175cb5 100644 --- a/jedeschule/pipelines/db_pipeline.py +++ b/jedeschule/pipelines/db_pipeline.py @@ -11,9 +11,14 @@ from jedeschule.pipelines.school_pipeline import SchoolPipelineItem Base = declarative_base() -engine = create_engine(os.environ.get("DATABASE_URL"), echo=False) -Session = sessionmaker(bind=engine) -session = Session() + + +def get_session(): + engine = create_engine(os.environ.get("DATABASE_URL"), echo=False) + Session = sessionmaker(bind=engine) + session = Session() + + return session class School(Base): @@ -36,6 +41,8 @@ class School(Base): @staticmethod def update_or_create(item: SchoolPipelineItem) -> School: + session = get_session() + school = session.query(School).get(item.info['id']) if school: session.query(School).filter_by(id=item.info['id']).update({**item.info, 'raw': item.item}) @@ -48,6 +55,7 @@ class DatabasePipeline(object): def process_item(self, item, spider): school = School.update_or_create(item) try: + session = get_session() session.add(school) session.commit() except SQLAlchemyError as e: diff --git a/test_models.py b/test_models.py index 44ddd7d..08110aa 100644 --- a/test_models.py +++ b/test_models.py @@ -7,7 +7,7 @@ from jedeschule.items import School from jedeschule.pipelines.school_pipeline import SchoolPipelineItem -from jedeschule.pipelines.db_pipeline import School as DBSchool, session +from jedeschule.pipelines.db_pipeline import School as DBSchool, get_session class TestSchoolItem(Item): @@ -23,6 +23,7 @@ def test_import_new(self): item = dict(name='Test Schule', nr=1) school_item: SchoolPipelineItem = SchoolPipelineItem(info=info, item=item) db_item = DBSchool.update_or_create(school_item) + session = get_session() session.add(db_item) session.commit() @@ -40,6 +41,7 @@ def test_import_existing(self): item = dict(name='Test Schule', nr=1) school_item: SchoolPipelineItem = SchoolPipelineItem(info=info, item=item) db_item = DBSchool.update_or_create(school_item) + session = get_session() session.add(db_item) session.commit() From 91d3034d8b18617ed3196550dc82e0bbabc1e802 Mon Sep 17 00:00:00 2001 From: Matthias Jacob Date: Thu, 21 Jan 2021 03:22:49 +0100 Subject: [PATCH 5/6] improve comparison script 1. output the HTTP error to make it easier to see what's happening 2. always output the NEW data, even in case of an HTTP error --- test_changes.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/test_changes.py b/test_changes.py index f7d9030..c00cb3e 100644 --- a/test_changes.py +++ b/test_changes.py @@ -46,12 +46,15 @@ def main(): data = load_data() for school in data[:10]: school_id = school.get('info').get('id') + + upstream_data = {} try: upstream_data = fetch_data(school_id) upstream_data.pop('raw') - compare_schools(school.get('info'), upstream_data) - except HTTPError: - print(f"Could not fetch old data for school-id {school_id}") + except HTTPError as e: + print(f"Could not fetch old data for school-id {school_id}: {e}") + + compare_schools(school.get('info'), upstream_data) if __name__ == "__main__": From 6ed8dbbcdaf1803f2947e411c919fde9074bfd06 Mon Sep 17 00:00:00 2001 From: Matthias Jacob Date: Thu, 21 Jan 2021 03:29:59 +0100 Subject: [PATCH 6/6] trying to make comparison log more readable --- test_changes.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/test_changes.py b/test_changes.py index c00cb3e..894aeec 100644 --- a/test_changes.py +++ b/test_changes.py @@ -24,8 +24,6 @@ def get_clean_item(data): def compare_schools(new_school, old_school): - print() - print(f"Comparing {new_school.get('id')}") new_school = sort_dict(new_school) old_school = sort_dict(old_school) @@ -47,12 +45,16 @@ def main(): for school in data[:10]: school_id = school.get('info').get('id') + print() + print('#'*10, f'Comparing {school_id}') + upstream_data = {} try: upstream_data = fetch_data(school_id) upstream_data.pop('raw') except HTTPError as e: - print(f"Could not fetch old data for school-id {school_id}: {e}") + print(f"WARN: Could not fetch old data for school-id {school_id}: {e}") + print() compare_schools(school.get('info'), upstream_data)