diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5af28bb --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +*.pkl +*.jsonl diff --git a/README.md b/README.md index 6ebd8bc..601c82d 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ Change into the `nwbib` directory: cd nwbib -Load NWBib data from the Lobid API: +Load sample NWBib data from the Lobid API: python3 nwbib_subjects_load.py @@ -26,6 +26,10 @@ Run classification experiment: python3 nwbib_subjects_process.py +Run bulk classification (first run takes some time): + + python3 nwbib_subjects_bulk.py + ## License [Eclipse Public License 2.0](http://www.eclipse.org/legal/epl-v20.html) \ No newline at end of file diff --git a/nwbib/nwbib-subjects-bulk-predict.csv b/nwbib/nwbib-subjects-bulk-predict.csv new file mode 100644 index 0000000..fead71c --- /dev/null +++ b/nwbib/nwbib-subjects-bulk-predict.csv @@ -0,0 +1,299 @@ +hbzId,subject +HT019197733,s733030 +HT012856855,s503430 +BT000003431,s841070 +HT016712778,s768030 +BT000004673,s841070 +HT017700139,s843090 +BT000004675,s783000 +BT000004695,s767040 +BT000004706,s768010 +HT016444185,s799200 +BT000004925,s582050 +HT017683803,s555020 +HT019178040,s844030 +BT000002444,s797010 +BT000003102,s841070 +BT000003964,s797010 +BT000004262,s841070 +BT000004268,s240000 +BT000004863,s841070 +BT000006046,s797010 +HT016708916,s425020 +BT000004974,s611050 +BT000004986,s768010 +BT000004989,s240000 +BT000005009,s841070 +BT000005048,s841070 +BT000005057,s406130 +BT000002448,s768030 +HT017404363,s841070 +BT000002480,s797010 +BT000003147,s827000 +BT000003528,s841070 +BT000003980,s736030 +BT000004555,s824040 +HT016713438,s744000 +BT000004571,s768010 +BT000004575,s768010 +BT000004580,s841070 +BT000004737,s824040 +HT016708934,s425000 +BT000006049,s797010 +BT000005076,s102000 +BT000005099,s768030 +HT015605337,s228000 +BT000002515,s794010 +BT000002517,s732000 +BT000003180,s804000 +BT000004298,s555000 +BT000004577,s768010 +HT002142428,s841070 +HT017049016,s613050 +HT017049114,s768010 +HT018940690,s109000 +BT000002537,s798200 +BT000003187,s163050 +BT000003780,s554000 +HT014090049,s557000 +BT000004597,s102000 +BT000004605,s844030 +BT000004608,s841070 +BT000004780,s841070 +HT019135849,s220500 +BT000005162,s102000 +BT000005191,s102000 +BT000005203,s768030 +BT000005228,s228000 +BT000002567,s613050 +HT017390012,s240000 +BT000002590,s240000 +BT000004001,s841070 +BT000004024,s611090 +BT000004025,s611090 +BT000004658,s768010 +BT000004655,s768010 +BT000004721,s768010 +BT000006213,s768010 +BT000006219,s844030 +HT017049047,s841070 +BT000006389,s768030 +BT000006583,s841070 +HT009125364,s841070 +HT017012857,s102000 +BT000002759,s797010 +BT000002752,s543800 +BT000003244,s843042 +BT000004333,s126000 +BT000004345,s240000 +BT000004684,s768010 +BT000006489,s442000 +BT000005167,s768010 +BT000005171,s747010 +BT000006587,s882024 +BT000006590,s802060 +BT000006594,s841070 +BT000006623,s225000 +BT000006625,s613090 +BT000002775,s768030 +BT000002792,s841070 +HT015287390,s222000 +HT017698791,s102000 +BT000003548,s767040 +BT000003822,s768010 +HT015287495,s611010 +HT015287558,s841040 +HT015287597,s768030 +HT015287623,s768010 +HT015287626,s613010 +HT017286817,s613010 +BT000006522,s826000 +BT000005184,s740100 +BT000002823,s543000 +BT000002826,s228000 +BT000002829,s844030 +BT000002833,s797010 +BT000003289,s768010 +BT000003293,s613050 +BT000003294,s744000 +HT018749389,s841070 +HT017049848,s796000 +BT000006413,s841070 +BT000006486,s768010 +BT000006565,s226000 +BT000005339,s768030 +HT017421266,s704063 +BT000003305,s768030 
+HT017015640,s797010 +HT017049481,s788000 +BT000006595,s823000 +BT000005369,s163080 +HT013987261,s544610 +HT017012889,s240000 +BT000005532,s841070 +BT000005800,s844050 +BT000005802,s846000 +BT000005803,s844030 +BT000005804,s844050 +BT000005819,s240000 +BT000003643,s217030 +BT000003852,s768010 +BT000004417,s768010 +BT000004424,s768010 +BT000005415,s102000 +HT017013602,s844200 +BT000005423,s102000 +HT017051937,s706042 +BT000003856,s768010 +BT000003874,s797010 +BT000003895,s532050 +BT000003898,s844200 +BT000004124,s841070 +BT000006574,s841070 +BT000006575,s762000 +BT000006640,s841070 +BT000002932,s613040 +BT000002952,s797010 +BT000003872,s797010 +BT000005726,s142380 +HT015287603,s844030 +HT015287607,s613050 +HT015287610,s613050 +BT000005912,s844200 +HT014135282,s142320 +BT000006004,s841070 +HT014092716,s841070 +BT000002980,s846000 +BT000003381,s768010 +BT000003382,s768010 +BT000003384,s768010 +BT000003876,s797010 +HT014129983,s613000 +BT000004467,s797010 +BT000004477,s797010 +HT013989184,s844200 +BT000006612,s109000 +BT000006653,s841040 +BT000005649,s613010 +BT000005892,s768030 +BT000006118,s841070 +HT017050088,s706090 +BT000003688,s843060 +BT000003691,s843046 +BT000003716,s406130 +BT000003720,s841070 +BT000004217,s586020 +HT014092622,s841070 +BT000005982,s768010 +BT000006110,s572050 +BT000006116,s820100 +BT000006170,s547460 +BT000003028,s768030 +BT000003031,s841070 +BT000003037,s724040 +HT013988338,s768030 +BT000003066,s611050 +BT000004224,s844030 +TT002621880,s841070 +BT000005519,s700100 +BT000005526,s768010 +BT000005699,s217000 +TT002623120,s841070 +BT000005779,s240000 +BT000005799,s844030 +HT018186727,s503260 +BT000006018,s557020 +HT017017675,s841070 +HT014132392,s217000 +HT017016033,s708060 +HT013988363,s768030 +HT012774080,s428030 +HT014134939,s768010 +HT012819603,s102000 +HT004170735,s708029 +HT013988397,s768030 +HT007382423,s844030 +HT017685972,s788000 +HT012773930,s240000 +HT015612888,s768010 +HT013989541,s102070 +HT019130151,s220500 +HT019036954,s205040 +HT007599850,s142380 +HT013989390,s543800 +HT012823761,s841070 +HT014131829,s841070 +HT019178300,s226030 +HT013989670,s841070 +HT017424893,s572050 +HT017525587,s102000 +HT013989567,s220500 +HT013990956,s768010 +HT014130082,s228000 +HT012821242,s240000 +HT018992763,s843000 +HT014715542,s225000 +HT017697493,s844200 +HT003470621,s240000 +HT012776653,s744000 +HT018933209,s700100 +HT007958739,s841070 +HT018286871,s768010 +HT017306644,s217000 +HT016001979,s611060 +HT014904955,s768010 +HT013150397,s841070 +HT019025915,s240000 +HT019493573,s228000 +HT016002174,s611060 +HT018791004,s225000 +CT002011301,s797010 +HT016997682,s800100 +HT016002021,s611060 +HT016995047,s102000 +HT018189291,s735020 +HT018942954,s841060 +HT019105829,s102000 +HT016006770,s844040 +TT002926919,s841070 +HT013151844,s425020 +HT013554686,s226030 +HT018788766,s562000 +HT014404763,s841070 +HT016456752,s768030 +HT014903161,s768010 +HT013152404,s841070 +HT016009664,s613043 +HT015381800,s213000 +HT019354900,s844200 +HT017054850,s768010 +HT014402043,s613010 +HT010848782,s768010 +HT013552435,s841070 +HT014992451,s224060 +HT014992452,s225000 +HT014992454,s221000 +HT018584327,s841070 +HT016141485,s841070 +HT013652175,s841070 +HT017056757,s532030 +HT013555584,s566070 +HT013201188,s544210 +HT016001648,s844200 +HT016002115,s220500 +HT016458705,s849056 +HT018183132,s736000 +HT016310431,s841070 +HT013652497,s102060 +HT017060772,s524010 +HT018966175,s240000 +HT014995656,s425020 +HT015383632,s843060 +HT014995854,s841070 +HT013151784,s768030 +HT019423240,s632050 +HT013554645,s262012 
+HT003295274,s557000
+HT003295284,s557020
+HT016313237,s768010
+HT011094158,s702000
diff --git a/nwbib/nwbib_subjects_bulk.py b/nwbib/nwbib_subjects_bulk.py
new file mode 100644
index 0000000..58bb17d
--- /dev/null
+++ b/nwbib/nwbib_subjects_bulk.py
@@ -0,0 +1,147 @@
+'''
+Created on Feb 16, 2018
+
+@author: fsteeg
+'''
+
+import json
+import csv
+import pickle
+import requests
+from pathlib import Path
+from sklearn.feature_extraction.text import HashingVectorizer
+from sklearn.svm import LinearSVC
+from sklearn.metrics import accuracy_score
+
+
+bulk_file = 'nwbib-subjects-bulk.jsonl'
+url = 'http://lobid.org/resources/search'
+params = {
+    'q': 'rheinland',  # for testing: use small-ish set
+    'nested': 'subject:subject.source.id:"http://purl.org/lobid/nwbib"',
+    'format': 'bulk'
+}
+saved_classifier_file = 'nwbib-subjects-classifier.pkl'
+stop_word_url = 'https://raw.githubusercontent.com/solariz/german_stopwords/master/german_stopwords_plain.txt'
+classifier = LinearSVC()
+
+output_file = 'nwbib-subjects-bulk-predict.csv'
+
+
+def main():
+
+    vectorizer = HashingVectorizer(n_features=2 ** 18, stop_words=stop_words())
+
+    create_bulk_data()
+    hbzIds, subjects, texts = load_from_jsonl(bulk_file)
+
+    # Hold out 1% of the documents for testing, train on the rest
+    test_set_size = len(subjects) // 100
+    Y_train, X_train_texts = subjects[test_set_size:], texts[test_set_size:]
+    ids_test, Y_test, X_test_texts = \
+        hbzIds[:test_set_size], subjects[:test_set_size], texts[:test_set_size]
+
+    print('{} training docs, {} testing docs'.format(len(Y_train), len(Y_test)))
+    print_info(Y_train[0], X_train_texts[0], vectorizer)
+
+    X_train = vectorizer.transform(X_train_texts)
+    X_test = vectorizer.transform(X_test_texts)
+
+    classifier = create_classifier(X_train, Y_train)
+
+    prediction, _score = predict(X_test, Y_test, classifier)
+
+    data = [(hbzId, prediction[i]) for (i, hbzId) in enumerate(ids_test)]
+    write_to_csv(output_file, data)
+
+
+def stop_words():
+    response = requests.get(url=stop_word_url)
+    return [line.strip() for line in response.text.splitlines()
+            if not line.startswith(';')]
+
+
+def create_bulk_data():
+    if not Path(bulk_file).exists():
+        print('Getting bulk data...')
+        response = requests.get(url=url, params=params)
+        with open(bulk_file, 'w') as f:
+            f.write(response.text)
+    else:
+        print('Using local bulk data in {}...'.format(bulk_file))
+
+
+def create_classifier(X_train, Y_train):
+    # Load a previously trained classifier if available, else train and save one
+    result = None
+    if Path(saved_classifier_file).exists():
+        print('Loading trained classifier...')
+        with open(saved_classifier_file, 'rb') as c:
+            result = pickle.load(c)
+    else:
+        print('Training classifier...')
+        result = classifier.fit(X_train, Y_train)
+        with open(saved_classifier_file, 'wb') as c:
+            pickle.dump(result, c)
+    return result
+
+
+def print_info(subject, text, vectorizer):
+    # See http://scikit-learn.org/stable/modules/feature_extraction.html
+    print('Using vectorizer: {}'.format(vectorizer))
+    analyzer = vectorizer.build_analyzer()
+    vector = vectorizer.transform([text])[0]
+    print('{}, {}'.format(subject, analyzer(text)))
+    print(vector)
+
+
+def predict(X_test, Y_test, classifier):
+    print('Predicting...')
+    Y_pred = classifier.predict(X_test)
+    score = accuracy_score(Y_test, Y_pred)
+    print('{:1.4f} classification accuracy for {}'.format(
+        score, classifier))
+    return Y_pred, score
+
+
+def load_from_jsonl(jsonl):
+    with open(jsonl, 'r') as f:
+        hbzIds = []
+        subjects = []
+        texts = []
+        for line in f:
+            entry = json.loads(line)
+            hbzId = entry['hbzId']
+            subject = first_nwbib_subject(entry)
+            # Build the document text from title, subtitle, and corporate body
+            title = entry.get('title', '')
+            sub = entry.get('otherTitleInformation', None)
+            corp = entry.get('corporateBodyForTitle', None)
+            vals = [title, sub[0] if sub else '', corp[0] if corp else '']
+            doc = ' '.join(vals).strip()
+            hbzIds.append(hbzId)
+            subjects.append(subject)
+            texts.append(doc)
+    return (hbzIds, subjects, texts)
+
+
+def write_to_csv(name, data):
+    with open(name, 'w', newline='') as csvfile:
+        writer = csv.writer(
+            csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
+        writer.writerow(['hbzId', 'subject'])
+        for (hbzId, subject) in data:
+            writer.writerow([hbzId, subject])
+
+
+def first_nwbib_subject(entry):
+    # Return the first NWBib subject id (the fragment after '#'), or 'NULL'
+    for subject in entry.get('subject', []):
+        source = subject.get('source', None)
+        if source and source.get('id', None) == 'http://purl.org/lobid/nwbib':
+            return subject['id'].split('#')[1]
+    return 'NULL'
+
+
+if __name__ == '__main__':
+    main()
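
Note on reusing the trained model: the script pickles only the classifier, not the vectorizer. That works because `HashingVectorizer` is stateless, so constructing it again with the same parameters (`n_features=2 ** 18` and the same stop word list) reproduces the exact feature space the classifier was trained on. A minimal sketch of how the saved `nwbib-subjects-classifier.pkl` could be applied to unseen titles, assuming a prior run of `nwbib_subjects_bulk.py` has created the pickle (the input title below is hypothetical):

    import pickle
    import requests
    from sklearn.feature_extraction.text import HashingVectorizer

    # Same stop word list that was used for training
    stop_word_url = ('https://raw.githubusercontent.com/solariz/german_stopwords'
                     '/master/german_stopwords_plain.txt')
    stop_words = [line.strip()
                  for line in requests.get(url=stop_word_url).text.splitlines()
                  if not line.startswith(';')]

    # HashingVectorizer is stateless: identical parameters yield the
    # identical feature space, so no fitted vectorizer needs to be stored
    vectorizer = HashingVectorizer(n_features=2 ** 18, stop_words=stop_words)

    with open('nwbib-subjects-classifier.pkl', 'rb') as c:
        classifier = pickle.load(c)

    titles = ['Geschichte der Stadt Bonn']  # hypothetical input title
    # Prints the predicted NWBib subject id for each input text
    print(classifier.predict(vectorizer.transform(titles)))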