From bbf3cd762108cf061385606130de3299bfdd2016 Mon Sep 17 00:00:00 2001 From: Ale Abdo Date: Fri, 2 Apr 2021 03:53:33 +0200 Subject: [PATCH] More improvements to ETL and comparing BIBLIO and PLOSC --- code/data_etl.ipynb | 664 +++++++++++------------------------- code/project_definitions.py | 4 + 2 files changed, 206 insertions(+), 462 deletions(-) diff --git a/code/data_etl.ipynb b/code/data_etl.ipynb index 29ff681..107af96 100644 --- a/code/data_etl.ipynb +++ b/code/data_etl.ipynb @@ -12,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": 195, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -20,6 +20,8 @@ "import re\n", "import numpy as np\n", "import pandas as pd\n", + "import unicodedata\n", + "import string\n", "import rispy\n", "import matplotlib.pyplot as plt\n", "from pathlib import Path\n", @@ -40,7 +42,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -57,7 +59,9 @@ " 'transforms': [],\n", "}\n", "wosSource = {\n", - " 'paths': [dataSourceDir / x for x in (\"wos1-500.ciw\", \"wos501-973.ciw\")],\n", + " 'paths': [\n", + " dataSourceDir / x for x in \"wos1001-1500.ciw wos1-500.ciw wos1501-1689.ciw wos501-1000.ciw\".split()\n", + " ],\n", " 'rispy_args': {'implementation': 'wok'},\n", " 'col_rename': {'publication_year': 'year', 'document_title': 'title'},\n", " 'transforms': [],\n", @@ -66,7 +70,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -86,7 +90,7 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -95,7 +99,7 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -104,7 +108,7 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -129,6 +133,65 @@ "allData = pd.concat(allDataList, join='outer', ignore_index=True)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Keep only article data\n", + "article_data = allData.loc[allData[\"type_of_reference\"].eq('JOUR') | allData[\"publication_type\"].eq('J')]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Normalize DOI\n", + "article_data.loc[:, 'doi'] = article_data['doi'].str.translate(\n", + " str.maketrans(string.ascii_lowercase, string.ascii_uppercase)\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Remove spurious records\n", + "article_data = article_data.loc[article_data['url'].ne(\n", + " \"https://www.scopus.com/inward/record.uri?eid=2-s2.0-85052219975&partnerID=40&md5=7b54756675a6d510c9db069b49b634d6\"\n", + ")]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Correct faulty records\n", + "data_corrections = {\n", + " 'doi': {\n", + " r'^(.*)/PDF$': r'\\1',\n", + " }\n", + "}\n", + "corrected_article_data = article_data.replace(data_corrections, regex=True)\n", + "article_data.compare(corrected_article_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "article_data = corrected_article_data" + ] + }, { "cell_type": "code", "execution_count": null, @@ -137,7 +200,7 @@ }, "outputs": [], "source": [ - "allData.describe()" + "article_data.describe()" ] }, { @@ -151,7 +214,11 @@ " return np.nan\n", " if sx.name == '__source':\n", " return sx.sum()\n", - " return sx[sx.map(len, na_action='ignore').idxmax()]\n", + " if sx.name == 'doi':\n", + " if len(sx.dropna().unique()) > 1:\n", + " print('Warning, merging different DOIs:\\n', sx)\n", + " return list(sx.dropna().unique())\n", + " return sx[sx.map(len, na_action='ignore').idxmax()] # Keep a list of all DOIs - must explode before using!\n", "\n", "def merge_records_keep_longest(dfx):\n", " return dfx.agg(merge_series_keep_longest)" @@ -163,10 +230,9 @@ "metadata": {}, "outputs": [], "source": [ - "# Keep only article data\n", - "article_data = allData.loc[allData[\"type_of_reference\"].eq('JOUR') | allData[\"publication_type\"].eq('J')]\n", "# Merge data with same DOI\n", "article_doi = article_data.groupby(article_data['doi'].values).agg(merge_records_keep_longest)\n", + "\n", "# Reassemble data with and without DOI\n", "article_nodoi = article_data[~article_data.doi.isin(article_doi.index)]\n", "article_data = pd.concat([article_doi, article_nodoi], ignore_index=True)" @@ -174,7 +240,18 @@ }, { "cell_type": "code", - "execution_count": 179, + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def remove_diacritics(input_str):\n", + " nfkd_form = unicodedata.normalize('NFKD', input_str)\n", + " return \"\".join([c for c in nfkd_form if not unicodedata.combining(c)])" + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -185,12 +262,13 @@ " .str.replace(r'[^\\s\\w]', ' ', regex=True)\n", " .str.replace(r'\\s+', ' ', regex=True)\n", " .str.strip()\n", + " # .map(remove_diacritics) # no need as our corpus is in English\n", " )" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -235,7 +313,7 @@ "metadata": {}, "outputs": [], "source": [ - "aa = articles_g.agg(list)[articles_g.size()>=2]" + "aa = articles_g.agg(list)[articles_g.size() > 1]" ] }, { @@ -245,10 +323,13 @@ "outputs": [], "source": [ "# Test alternatives matchers\n", - "# articles_gx = article_data.groupby(Match(article_data, 15).match)\n", - "# bb = articles_gx.agg(list)[articles_gx.size()>=2]\n", - "# set(clean_titles(aa.explode('title')['title'])).difference(clean_title(bb.explode('title')['title']))\n", - "# set(clean_titles(bb.explode('title')['title'])).difference(clean_title(aa.explode('title')['title']))" + "if False:\n", + " articles_gx = article_data.groupby(Match(article_data, 15).match)\n", + " bb = articles_gx.agg(list)[articles_gx.size() > 1]\n", + " pprint([sorted(x) for x in (\n", + " set(clean_titles(aa.explode('title')['title'])).difference(clean_titles(bb.explode('title')['title'])),\n", + " set(clean_titles(bb.explode('title')['title'])).difference(clean_titles(aa.explode('title')['title'])),\n", + " )])" ] }, { @@ -257,102 +338,53 @@ "metadata": {}, "outputs": [], "source": [ - "# Check that matching titles also have matching year and author (impl: first author last name)\n", - "assert aa['year'].map(lambda x: len(set(x)) < 2).all()\n", - "aa['authors'].map(\n", - " lambda x: set(\n", - " tuple(z.split(',')[0].split(' ')[-1] for z in y) # last name of each author\n", - " for y in x\n", - " if not ( isinstance(y, np.float) and pd.isna(y) ) # skip NANs\n", + "def clean_name(name):\n", + " return remove_diacritics(name.split(',')[0].split(' ')[-1].lower().replace(' ', '').replace('-', ''))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Check that matching titles also have matching year\n", + "sel = aa['year'].map(lambda x: len(set(x)) > 1)\n", + "aa[sel]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Check that matching titles also have matching author (impl: first author last name)\n", + "sel = aa['authors'].map(\n", + " lambda merged_authors: set(\n", + " tuple( # last name of each author\n", + " clean_name(author)\n", + " for author in authors\n", + " )\n", + " for authors in merged_authors\n", + " if not ( isinstance(authors, float) and pd.isna(authors) ) # skip NANs\n", " )\n", ").map(\n", - " lambda x: sum(\n", - " edit_distance(y, z) # sum the edit distances\n", - " for x in list(zip(*x))[:1] # first authors\n", - " for i, y in enumerate(x) for z in x[i+1:] # distinct pairs\n", + " lambda merged_lastnames: sum(\n", + " edit_distance(firstauthor, other_firstauthor) # sum the edit distances\n", + " for merged_firstauthor in list(zip(*merged_lastnames))[:1] # first authors\n", + " for i, firstauthor in enumerate(merged_firstauthor)\n", + " for other_firstauthor in merged_firstauthor[i+1:] # distinct pairs\n", " )\n", - ").max() < 2" + ") > 0\n", + "aa[sel].authors.to_dict()" ] }, { "cell_type": "code", - "execution_count": 90, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
doititleauthors
count623706702
unique623706680
top10.1016/j.ohx.2020.e00127Research on Monitoring Platform of Agricultura...[Pearce, J.M.]
freq1110
\n", - "
" - ], - "text/plain": [ - " doi \\\n", - "count 623 \n", - "unique 623 \n", - "top 10.1016/j.ohx.2020.e00127 \n", - "freq 1 \n", - "\n", - " title authors \n", - "count 706 702 \n", - "unique 706 680 \n", - "top Research on Monitoring Platform of Agricultura... [Pearce, J.M.] \n", - "freq 1 10 " - ] - }, - "execution_count": 90, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "article_data[['doi', 'title', 'authors']].describe()" ] @@ -384,101 +416,18 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Load article data (instead of running the code above)" - ] - }, - { - "cell_type": "code", - "execution_count": 99, - "metadata": {}, - "outputs": [], - "source": [ - "data_corrections = {\n", - " 'doi': {\n", - " r'^(.*)/pdf$': r'\\1',\n", - "# r'^(.*)/\\w+/$': r'\\1',\n", - " }\n", - "}" + "# Load article data (if already stored from the code above)" ] }, { "cell_type": "code", - "execution_count": 100, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "article_data = load_data(articleDataFile)" ] }, - { - "cell_type": "code", - "execution_count": 101, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
doi
selfother
24510.1088/2058-7058/31/8/34/pdf10.1088/2058-7058/31/8/34
\n", - "
" - ], - "text/plain": [ - " doi \n", - " self other\n", - "245 10.1088/2058-7058/31/8/34/pdf 10.1088/2058-7058/31/8/34" - ] - }, - "execution_count": 101, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "rep_article_data = article_data.replace(data_corrections, regex=True)\n", - "article_data.compare(rep_article_data)" - ] - }, - { - "cell_type": "code", - "execution_count": 102, - "metadata": {}, - "outputs": [], - "source": [ - "article_data = rep_article_data" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -488,7 +437,7 @@ }, { "cell_type": "code", - "execution_count": 169, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -497,7 +446,7 @@ }, { "cell_type": "code", - "execution_count": 170, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -521,23 +470,27 @@ "metadata": {}, "outputs": [], "source": [ - "assert plosData[\"URI (DOI or URL)\"].notna().all()" + "assert plosData[\"URI (DOI or URL)\"].notna().all()\n", + "# Normalize DOI\n", + "plosData[\"URI (DOI or URL)\"] = plosData[\"URI (DOI or URL)\"].str.translate(\n", + " str.maketrans(string.ascii_lowercase, string.ascii_uppercase)\n", + ")" ] }, { "cell_type": "code", - "execution_count": 171, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Get the doi and doi-like, fixing doi-like containing extra stuff\n", "re_doi = r\"(10\\.[1-9]\\d{3,}(?:\\.\\d+)*/.+)\"\n", - "re_http_doi_fix = r\"https?://.*/\" + re_doi + r\"(?:/|/full|/abstract|#\\w+)$\"" + "re_http_doi_fix = r\"HTTPS?://.*/\" + re_doi + r\"(?:/|/FULL|/ABSTRACT|#\\w+)$\"" ] }, { "cell_type": "code", - "execution_count": 172, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -546,122 +499,29 @@ }, { "cell_type": "code", - "execution_count": 173, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plosData_doi_http_doi_fixed = (\n", " plosData['URI (DOI or URL)']\n", - " .str.extract(re_httpdoi)[0]\n", + " .str.extract(re_http_doi_fix)[0]\n", " .map(unquote, na_action='ignore')\n", ")" ] }, { "cell_type": "code", - "execution_count": 174, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
selfother
3510.5334/joh.7/10.5334/joh.7
3610.5334/joh.4/10.5334/joh.4
9610.3389/fnbeh.2019.00140/full10.3389/fnbeh.2019.00140
9810.3389/fncir.2012.00098/full10.3389/fncir.2012.00098
9910.3389/fneng.2014.00043/full10.3389/fneng.2014.00043
10310.3389/fnins.2019.00784/full10.3389/fnins.2019.00784
12610.1088/1741-2552/aa6806#jneaa6806f0110.1088/1741-2552/aa6806
12810.5334/joh.14/10.5334/joh.14
13410.3389/fphys.2019.00099/abstract10.3389/fphys.2019.00099
\n", - "
" - ], - "text/plain": [ - " self other\n", - "35 10.5334/joh.7/ 10.5334/joh.7\n", - "36 10.5334/joh.4/ 10.5334/joh.4\n", - "96 10.3389/fnbeh.2019.00140/full 10.3389/fnbeh.2019.00140\n", - "98 10.3389/fncir.2012.00098/full 10.3389/fncir.2012.00098\n", - "99 10.3389/fneng.2014.00043/full 10.3389/fneng.2014.00043\n", - "103 10.3389/fnins.2019.00784/full 10.3389/fnins.2019.00784\n", - "126 10.1088/1741-2552/aa6806#jneaa6806f01 10.1088/1741-2552/aa6806\n", - "128 10.5334/joh.14/ 10.5334/joh.14\n", - "134 10.3389/fphys.2019.00099/abstract 10.3389/fphys.2019.00099" - ] - }, - "execution_count": 174, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "plosData_doi.loc[plosData_doi_http_doi_fixed.notna()].compare(plosData_doi_http_doi_fixed.dropna())" ] }, { "cell_type": "code", - "execution_count": 175, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -671,52 +531,22 @@ }, { "cell_type": "code", - "execution_count": 176, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "4 10.1002/elps.201800304\n", - "35 10.5334/joh.7\n", - "36 10.5334/joh.4\n", - "65 10.1021/acs.analchem.9b02628\n", - "66 10.1063/1.4941068\n", - " ... \n", - "251 10.1371/journal.pone.0011890\n", - "317 10.1371/journal.pone.0214460\n", - "319 10.1371/journal.pone.0192752\n", - "330 10.1016/j.techfore.2020.119986\n", - "331 10.1111/tra.12728\n", - "Name: doi, Length: 126, dtype: object" - ] - }, - "execution_count": 176, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "plosData['doi'].dropna()" ] }, { "cell_type": "code", - "execution_count": 193, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "19 712\n" - ] - } - ], + "outputs": [], "source": [ "print(\n", - " len(set(plosData['doi'].dropna()).intersection(article_data['doi'])),\n", - " len(set(plosData['doi'].dropna()).symmetric_difference(article_data['doi'])),\n", + " len(set(plosData['doi'].dropna()).intersection(article_data['doi'].explode())),\n", + " len(set(plosData['doi'].dropna()).symmetric_difference(article_data['doi'].explode())),\n", ")" ] }, @@ -729,7 +559,7 @@ }, { "cell_type": "code", - "execution_count": 224, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -738,43 +568,21 @@ }, { "cell_type": "code", - "execution_count": 197, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "36" - ] - }, - "execution_count": 197, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# How many from the collection have their title in article_data\n", "plosData['Title (URL items only)'].pipe(clean_titles).map(\n", - " lambda x: article_data.title.pipe(clean_titles).str.contains(rf'(?i){x}', regex=True).any()\n", + " lambda x: article_data['title'].pipe(clean_titles).str.contains(rf'(?i){x}', regex=True).any()\n", ").sum()" ] }, { "cell_type": "code", - "execution_count": 198, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "20" - ] - }, - "execution_count": 198, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# How many from the collection have their title in article_data if we require they have DOIs\n", "plosData['Title (URL items only)'].loc[plosData['doi'].notna()].pipe(clean_titles).map(\n", @@ -784,27 +592,9 @@ }, { "cell_type": "code", - "execution_count": 210, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "219 10.1371/journal.pone.0168207\n", - "117 10.1016/j.ohx.2017.07.001\n", - "203 10.1371/journal.pone.0181560\n", - "231 10.1371/journal.pone.0134989\n", - "190 10.1371/journal.pone.0201353\n", - "65 10.1021/acs.analchem.9b02628\n", - "181 10.1371/journal.pone.0220091\n", - "232 10.1371/journal.pone.0124938\n", - "182 10.1371/journal.pone.0228140\n", - "210 10.1371/journal.pone.0178540\n", - "Name: doi, dtype: object\n" - ] - } - ], + "outputs": [], "source": [ "# Give me 10 from the collection having DOIs\n", "z = plosData['doi'].dropna().sample(10)\n", @@ -813,23 +603,9 @@ }, { "cell_type": "code", - "execution_count": 212, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "219 chaos based simultaneous compression and encryption for hadoop\n", - "203 feasibility of a 3d printed anthropomorphic patient specific head phantom for patient specific quality assurance of intensity modulated radiotherapy\n", - "65 odx a fitness tracker based device for continuous bacterial growth monitoring\n", - "181 a low cost fluorescence reader for in vitro transcription and nucleic acid detection with cas13a\n", - "232 multi contrast imaging and digital refocusing on a mobile microscope with a domed led array\n", - "182 fieldwork based determination of design priorities for point of use drinking water quality sensors for use in resource limited environments\n", - "210 from medical imaging data to 3d printed anatomical models\n" - ] - } - ], + "outputs": [], "source": [ "# Get their titles if their titles are not in article_data\n", "for i, title in plosData.loc[z.index]['Title (URL items only)'].pipe(clean_titles).items():\n", @@ -839,42 +615,20 @@ }, { "cell_type": "code", - "execution_count": 246, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "107" - ] - }, - "execution_count": 246, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Selector for DOIs only in the collection\n", - "sel_new_doi = ~plosData[\"doi\"].dropna().isin(article_data.doi.values)\n", + "sel_new_doi = ~plosData[\"doi\"].dropna().isin(article_data['doi'].explode().values)\n", "sel_new_doi.sum()" ] }, { "cell_type": "code", - "execution_count": 263, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "136" - ] - }, - "execution_count": 263, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Selector for Titles only in the collection\n", "sel_new_title = ~clean_titles(plosData[\"Title (URL items only)\"]).isin(clean_titles(article_data['title']))\n", @@ -883,17 +637,9 @@ }, { "cell_type": "code", - "execution_count": 268, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "bottom-illuminated orbital shaker for microalgae cultivation 10.1016/j.ohx.2020.e00143 10.1101/2020.05.01.071878\n" - ] - } - ], + "outputs": [], "source": [ "# Same title, different DOIs\n", "x = plosData[[\"doi\", \"Title (URL items only)\"]].loc[sel_new_doi & ~sel_new_title]\n", @@ -910,7 +656,7 @@ }, { "cell_type": "code", - "execution_count": 269, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -918,8 +664,8 @@ "x = plosData.loc[~sel_new_doi & sel_new_title, 'doi']\n", "for y in x:\n", " print(\n", - " plosData.loc[plosData.doi.eq(y), \"Title (URL items only)\"].squeeze(),\n", - " article_data.loc[article_data.doi.eq(y), 'title'].squeeze(),\n", + " plosData.loc[plosData['doi'].eq(y), \"Title (URL items only)\"].squeeze(),\n", + " article_data.loc[article_data['doi'].explode().eq(y), 'title'].squeeze(),\n", " )" ] }, @@ -936,7 +682,7 @@ "metadata": {}, "outputs": [], "source": [ - "data = nad" + "article_data.shape" ] }, { @@ -945,8 +691,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(data.shape)\n", - "print(data.columns)" + "article_data.issn.str.replace('[^\\d]', '', regex=True).value_counts()" ] }, { @@ -955,7 +700,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(article_data.shape)" + "article_data.issn.str.replace('[^\\d]', '', regex=True).value_counts().reset_index().plot(loglog=True)" ] }, { @@ -964,25 +709,14 @@ "metadata": {}, "outputs": [], "source": [ - "dup_title = article_data.duplicated('title', keep=False)\n", - "dup_doi = article_data.duplicated('doi', keep=False)\n", - "nan_doi = article_data['doi'].isna()\n", - "print(\n", - " dup_title.sum(),\n", - " dup_doi.sum(),\n", - " nan_doi.sum(),\n", - " (dup_title & dup_doi).sum(),\n", - " (dup_title & ~dup_doi).sum(),\n", - ")" + "article_data.groupby('year').size().plot.bar()" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "article_data.issn.str.replace('[^\\d]', '', regex=True).value_counts()" + "## Play with our 10 article sample" ] }, { @@ -991,7 +725,20 @@ "metadata": {}, "outputs": [], "source": [ - "article_data.issn.str.replace('[^\\d]', '', regex=True).value_counts().reset_index().plot(loglog=True)" + "dois = pd.Series(\"\"\"\n", + " 10.1371/journal.pone.0187219\n", + " 10.1371/journal.pone.0059840\n", + " 10.1371/journal.pone.0030837\n", + " 10.1371/journal.pone.0118545\n", + " 10.1371/journal.pone.0206678\n", + " 10.1371/journal.pone.0143547\n", + " 10.1371/journal.pone.0220751\n", + " 10.1371/journal.pone.0107216\n", + " 10.1371/journal.pone.0226761\n", + " 10.1371/journal.pone.0193744\n", + "\"\"\".split()).str.translate(\n", + " str.maketrans(string.ascii_lowercase, string.ascii_uppercase)\n", + ")" ] }, { @@ -1000,15 +747,8 @@ "metadata": {}, "outputs": [], "source": [ - "article_data.groupby('year').size().plot.bar()" + "dois[dois.isin(article_data.doi.explode())]" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/code/project_definitions.py b/code/project_definitions.py index eedc0b8..2498649 100644 --- a/code/project_definitions.py +++ b/code/project_definitions.py @@ -35,13 +35,17 @@ def build_query(): adjectives = [ 'open', 'open source', + 'opensource', 'open science', 'frugal', + #'do it yourself' + #'diy' #'low cost', ] phrases = [ ' '.join([a, n]) for a in adjectives for n in nouns ] + phrases.remove('open design') phrases.extend([ "free hardware and software", ])