From 7dbc878294c61467a9785a0e7b1257cec0532699 Mon Sep 17 00:00:00 2001 From: Chales Horn Date: Thu, 6 Apr 2023 13:10:38 +1200 Subject: [PATCH 1/4] add further tests for sn, sl, nd abbreviations --- .../bin_expect/test-publish-sn-sl-nd.json | 22 ++++++++++++ .../bin_expect/test-publish-sn-sl.json | 35 +++++++++++++++++++ .../bin_input/test-publish-sn-sl-nd.mrc | 1 + .../bin_input/test-publish-sn-sl.mrc | 1 + openlibrary/catalog/marc/tests/test_parse.py | 2 ++ 5 files changed, 61 insertions(+) create mode 100644 openlibrary/catalog/marc/tests/test_data/bin_expect/test-publish-sn-sl-nd.json create mode 100644 openlibrary/catalog/marc/tests/test_data/bin_expect/test-publish-sn-sl.json create mode 100644 openlibrary/catalog/marc/tests/test_data/bin_input/test-publish-sn-sl-nd.mrc create mode 100644 openlibrary/catalog/marc/tests/test_data/bin_input/test-publish-sn-sl.mrc diff --git a/openlibrary/catalog/marc/tests/test_data/bin_expect/test-publish-sn-sl-nd.json b/openlibrary/catalog/marc/tests/test_data/bin_expect/test-publish-sn-sl-nd.json new file mode 100644 index 00000000000..a3fd054e372 --- /dev/null +++ b/openlibrary/catalog/marc/tests/test_data/bin_expect/test-publish-sn-sl-nd.json @@ -0,0 +1,22 @@ +{ + "publish_country": "is", + "languages": [ + "heb" + ], + "title": "Sefer taharat Yosef", + "subtitle": "al hakalhot taharat ha-mish pahah: Rabi \u02bbOvadyah Yosef ...", + "authors": [ + { + "name": "Yosef, Ovadia", + "entity_type": "person", + "personal_name": "Yosef, Ovadia" + } + ], + "date": "[n.d.]", + "publishers": [ + "[s.n.]" + ], + "publish_places": [ + "[s.l.]" + ] +} diff --git a/openlibrary/catalog/marc/tests/test_data/bin_expect/test-publish-sn-sl.json b/openlibrary/catalog/marc/tests/test_data/bin_expect/test-publish-sn-sl.json new file mode 100644 index 00000000000..68c0875a7e0 --- /dev/null +++ b/openlibrary/catalog/marc/tests/test_data/bin_expect/test-publish-sn-sl.json @@ -0,0 +1,35 @@ +{ + "publish_country": "xx", + "languages": [ + "eng" + ], + "title": "Indirect results of missionary labor in northern Turkey", + "by_statement": "by E.E. Bliss, D.D., of Constantinople", + "authors": [ + { + "name": "Bliss, E. E.", + "entity_type": "person", + "personal_name": "Bliss, E. E." + } + ], + "oclc_numbers": [ + "61406084" + ], + "lc_classifications": [ + "BV3170 .B65 1900" + ], + "notes": "Caption title.", + "subject_places": [ + "Turkey" + ], + "subjects": [ + "Missions" + ], + "publishers": [ + "[s.n.]" + ], + "publish_places": [ + "[s.l.]" + ], + "pagination": "7 pages" +} diff --git a/openlibrary/catalog/marc/tests/test_data/bin_input/test-publish-sn-sl-nd.mrc b/openlibrary/catalog/marc/tests/test_data/bin_input/test-publish-sn-sl-nd.mrc new file mode 100644 index 00000000000..ca1ea5c4d8b --- /dev/null +++ b/openlibrary/catalog/marc/tests/test_data/bin_input/test-publish-sn-sl-nd.mrc @@ -0,0 +1 @@ +00315cam a2200109 a 4500001000800000005001700008008004100025245008500066260002700151700001900178852000800197527654020050406103838.0050406m is b 000 0 heb d10aSefer taharat Yosefbal hakalhot taharat ha-mish pahah: Rabi ʻOvadyah Yosef ... a[s.l]:b[s.n],c[n.d.]1 aYosef, Ovadia.0 bglx \ No newline at end of file diff --git a/openlibrary/catalog/marc/tests/test_data/bin_input/test-publish-sn-sl.mrc b/openlibrary/catalog/marc/tests/test_data/bin_input/test-publish-sn-sl.mrc new file mode 100644 index 00000000000..af937be2a1f --- /dev/null +++ b/openlibrary/catalog/marc/tests/test_data/bin_input/test-publish-sn-sl.mrc @@ -0,0 +1 @@ +00764cam a2200241Ia 4500001000800000005001700008008004100025035002300066035001700089035001200106040001300118043001200131050002200143100001700165245010300182260004700285300002100332336002600353337002800379500001900407650007500426852002100501541517320221110033004.0050902s190u xx 000 0 eng d a(OCoLC)ocm61406084 a(NNC)5415173 a5415173 aZCUcZCU aa-tu--- 4aBV3170b.B65 19001 aBliss, E. E.10aIndirect results of missionary labor in northern Turkey /cby E.E. Bliss, D.D., of Constantinople. a[S.n.] :b[s.l.],c[between 1900 and 1909] a7 pages ;c24 cm atextbtxt2rdacontent aunmediatedbn2rdamedia aCaption title. 0aMissionszTurkey.0http://id.loc.gov/authorities/subjects/sh201010216980buts,mrlxxph1884 \ No newline at end of file diff --git a/openlibrary/catalog/marc/tests/test_parse.py b/openlibrary/catalog/marc/tests/test_parse.py index bd8e3a3fac6..5bdd7c66d53 100644 --- a/openlibrary/catalog/marc/tests/test_parse.py +++ b/openlibrary/catalog/marc/tests/test_parse.py @@ -76,6 +76,8 @@ '880_Nihon_no_chasho.mrc', '880_publisher_unlinked.mrc', '880_arabic_french_many_linkages.mrc', + 'test-publish-sn-sl.mrc', + 'test-publish-sn-sl-nd.mrc', ] test_data = "%s/test_data" % os.path.dirname(__file__) From 84feb808edc67dd2645e07a2d89ed3c310c719b7 Mon Sep 17 00:00:00 2001 From: Chales Horn Date: Mon, 10 Apr 2023 18:10:12 +1200 Subject: [PATCH 2/4] case insensitive test fix --- openlibrary/catalog/marc/parse.py | 8 ++++---- .../marc/tests/test_data/bin_input/test-publish-sn-sl.mrc | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/openlibrary/catalog/marc/parse.py b/openlibrary/catalog/marc/parse.py index 5c5e5ab727c..0b908bcc31a 100644 --- a/openlibrary/catalog/marc/parse.py +++ b/openlibrary/catalog/marc/parse.py @@ -327,7 +327,7 @@ def read_languages(rec: MarcBase, lang_008: Optional[str] = None) -> list[str]: def read_pub_date(rec: MarcBase) -> str | None: def publish_date(s: str) -> str: date = s.strip('[]') - if date == 'n.d.': # No date + if date.lower() == 'n.d.': # No date date = '[n.d.]' return remove_trailing_number_dot(date) @@ -337,14 +337,14 @@ def publish_date(s: str) -> str: def read_publisher(rec: MarcBase) -> dict[str, Any] | None: def publisher_name(s: str) -> str: - name = s.strip(' /,;:[') - if name == 's.n.': # Sine nomine + name = s.strip(' /,;:[]') + if name.lower() == 's.n.': # Sine nomine name = '[s.n.]' return name def publish_place(s: str) -> str: place = s.strip(' /.,;:[') - if place == 's.l.': # Sine loco + if place.lower().startswith('s.l.'): # Sine loco place = '[s.l.]' return place diff --git a/openlibrary/catalog/marc/tests/test_data/bin_input/test-publish-sn-sl.mrc b/openlibrary/catalog/marc/tests/test_data/bin_input/test-publish-sn-sl.mrc index af937be2a1f..afa032b178d 100644 --- a/openlibrary/catalog/marc/tests/test_data/bin_input/test-publish-sn-sl.mrc +++ b/openlibrary/catalog/marc/tests/test_data/bin_input/test-publish-sn-sl.mrc @@ -1 +1 @@ -00764cam a2200241Ia 4500001000800000005001700008008004100025035002300066035001700089035001200106040001300118043001200131050002200143100001700165245010300182260004700285300002100332336002600353337002800379500001900407650007500426852002100501541517320221110033004.0050902s190u xx 000 0 eng d a(OCoLC)ocm61406084 a(NNC)5415173 a5415173 aZCUcZCU aa-tu--- 4aBV3170b.B65 19001 aBliss, E. E.10aIndirect results of missionary labor in northern Turkey /cby E.E. Bliss, D.D., of Constantinople. a[S.n.] :b[s.l.],c[between 1900 and 1909] a7 pages ;c24 cm atextbtxt2rdacontent aunmediatedbn2rdamedia aCaption title. 0aMissionszTurkey.0http://id.loc.gov/authorities/subjects/sh201010216980buts,mrlxxph1884 \ No newline at end of file +00764cam a2200241Ia 4500001000800000005001700008008004100025035002300066035001700089035001200106040001300118043001200131050002200143100001700165245010300182260004700285300002100332336002600353337002800379500001900407650007500426852002100501541517320221110033004.0050902s190u xx 000 0 eng d a(OCoLC)ocm61406084 a(NNC)5415173 a5415173 aZCUcZCU aa-tu--- 4aBV3170b.B65 19001 aBliss, E. E.10aIndirect results of missionary labor in northern Turkey /cby E.E. Bliss, D.D., of Constantinople. a[s.l.] :b[S.n.],c[between 1900 and 1909] a7 pages ;c24 cm atextbtxt2rdacontent aunmediatedbn2rdamedia aCaption title. 0aMissionszTurkey.0http://id.loc.gov/authorities/subjects/sh201010216980buts,mrlxxph1884 \ No newline at end of file From dd1ec4e3f5c46d1522cef45cd1f4cb94de43eea3 Mon Sep 17 00:00:00 2001 From: Chales Horn Date: Mon, 10 Apr 2023 18:37:44 +1200 Subject: [PATCH 3/4] no date tends to be blank rather than [n.d.] --- openlibrary/catalog/marc/parse.py | 4 ++-- .../tests/test_data/bin_expect/test-publish-sn-sl-nd.json | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/openlibrary/catalog/marc/parse.py b/openlibrary/catalog/marc/parse.py index 0b908bcc31a..6ab14b0205d 100644 --- a/openlibrary/catalog/marc/parse.py +++ b/openlibrary/catalog/marc/parse.py @@ -338,13 +338,13 @@ def publish_date(s: str) -> str: def read_publisher(rec: MarcBase) -> dict[str, Any] | None: def publisher_name(s: str) -> str: name = s.strip(' /,;:[]') - if name.lower() == 's.n.': # Sine nomine + if name.lower().startswith('s.n'): # Sine nomine name = '[s.n.]' return name def publish_place(s: str) -> str: place = s.strip(' /.,;:[') - if place.lower().startswith('s.l.'): # Sine loco + if place.lower().startswith('s.l'): # Sine loco place = '[s.l.]' return place diff --git a/openlibrary/catalog/marc/tests/test_data/bin_expect/test-publish-sn-sl-nd.json b/openlibrary/catalog/marc/tests/test_data/bin_expect/test-publish-sn-sl-nd.json index a3fd054e372..9c84f866c69 100644 --- a/openlibrary/catalog/marc/tests/test_data/bin_expect/test-publish-sn-sl-nd.json +++ b/openlibrary/catalog/marc/tests/test_data/bin_expect/test-publish-sn-sl-nd.json @@ -12,7 +12,6 @@ "personal_name": "Yosef, Ovadia" } ], - "date": "[n.d.]", "publishers": [ "[s.n.]" ], From 3f5bc04564496166442ab2c5f53d2a58acb50e17 Mon Sep 17 00:00:00 2001 From: Chales Horn Date: Tue, 11 Apr 2023 11:36:18 +1200 Subject: [PATCH 4/4] allow uncertain dates, e.g. 190u --- openlibrary/catalog/marc/parse.py | 3 ++- .../marc/tests/test_data/bin_expect/test-publish-sn-sl.json | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/openlibrary/catalog/marc/parse.py b/openlibrary/catalog/marc/parse.py index 6ab14b0205d..b6414b129f6 100644 --- a/openlibrary/catalog/marc/parse.py +++ b/openlibrary/catalog/marc/parse.py @@ -19,6 +19,7 @@ DNB_AGENCY_CODE = 'DE-101' max_number_of_pages = 50000 # no monograph should be longer than 50,000 pages re_bad_char = re.compile('\ufffd') +re_date = re.compile(r'^[0-9]+u*$') re_question = re.compile(r'^\?+$') re_lccn = re.compile(r'([ \dA-Za-z\-]{3}[\d/-]+).*') re_oclc = re.compile(r'^\(OCoLC\).*?0*(\d+)') @@ -664,7 +665,7 @@ def read_edition(rec: MarcBase) -> dict[str, Any]: raise BadMARC("'008' field must not be blank") publish_date = f[7:11] - if publish_date.isdigit() and publish_date != '0000': + if re_date.match(publish_date) and publish_date != '0000': edition["publish_date"] = publish_date if f[6] == 't': edition["copyright_date"] = f[11:15] diff --git a/openlibrary/catalog/marc/tests/test_data/bin_expect/test-publish-sn-sl.json b/openlibrary/catalog/marc/tests/test_data/bin_expect/test-publish-sn-sl.json index 68c0875a7e0..e228d0db06a 100644 --- a/openlibrary/catalog/marc/tests/test_data/bin_expect/test-publish-sn-sl.json +++ b/openlibrary/catalog/marc/tests/test_data/bin_expect/test-publish-sn-sl.json @@ -25,6 +25,7 @@ "subjects": [ "Missions" ], + "publish_date": "190u", "publishers": [ "[s.n.]" ],