Skip to content

Commit

Permalink
Update scripts style autoPEP8
Browse files Browse the repository at this point in the history
  • Loading branch information
northword committed Aug 19, 2023
1 parent 1eabdb7 commit 2fc8000
Show file tree
Hide file tree
Showing 8 changed files with 64 additions and 46 deletions.
8 changes: 5 additions & 3 deletions scripts/check_ampersands.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,8 @@
if ('\&' in line):
errFileNames.append(file)
errRows.append(i + 1)
errCols.append([index + 1 for index in range(len(line)) if line.startswith('\&', index)])
errCols.append(
[index + 1 for index in range(len(line)) if line.startswith('\&', index)])


# In the case where we do find escaped &, the len() will be non-zero
Expand All @@ -43,8 +44,9 @@
# For each file, append every row:col location to the error message
for i, fname in enumerate(errFileNames):
for col in errCols[i]:
err_msg += "("+ fname + ", " + str(errRows[i]) + ":" + str(col) + "), "
err_msg += "(" + fname + ", " + \
str(errRows[i]) + ":" + str(col) + "), "
# Format end of string and return as Value Error to 'fail' GitHub Actions process
err_msg = err_msg[:len(err_msg) - 2]
err_msg += "]"
raise ValueError("Found Escaped Ampersands at: " + err_msg)
raise ValueError("Found Escaped Ampersands at: " + err_msg)
5 changes: 3 additions & 2 deletions scripts/combine_journal_lists.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,12 @@
import sys
import pandas as pd


def main(output_filename):
# Read and merge CSV files
# dfs = [pd.read_csv(file, header=None) for file in import_order]
dfs = []
for file in sys.argv[2:]:
for file in sys.argv[2:]:
df = pd.read_csv(file, header=None)
dfs.append(df)
print(f"{file}: {len(df)}")
Expand All @@ -38,5 +39,5 @@ def main(output_filename):
filename = sys.argv[1]
else:
filename = "journalList.csv"

main(filename)
9 changes: 5 additions & 4 deletions scripts/combine_journal_lists_dotless.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,12 @@

# Define the list of CSV files
import_order = [
'journals/journal_abbreviations_entrez.csv',
'journals/journal_abbreviations_medicus.csv',
'journals/journal_abbreviations_webofscience-dotless.csv'
'journals/journal_abbreviations_entrez.csv',
'journals/journal_abbreviations_medicus.csv',
'journals/journal_abbreviations_webofscience-dotless.csv'
]


def main(output_filename):
# Read and merge CSV files
# dfs = [pd.read_csv(file, header=None) for file in import_order]
Expand Down Expand Up @@ -50,5 +51,5 @@ def main(output_filename):
filename = sys.argv[1]
else:
filename = "journalList_dotless.csv"

main(filename)
3 changes: 2 additions & 1 deletion scripts/combine_journal_lists_dots.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
'journals/journal_abbreviations_webofscience-dots.csv'
]


def main(output_filename):
# Read and merge CSV files
# dfs = [pd.read_csv(file, header=None) for file in import_order]
Expand Down Expand Up @@ -58,5 +59,5 @@ def main(output_filename):
filename = sys.argv[1]
else:
filename = "journalList_dots.csv"

main(filename)
1 change: 1 addition & 0 deletions scripts/convert_to_comma.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

import csv


def convert_semicolon_to_comma(input_file, output_file):
with open(input_file, 'r', newline='', encoding='utf-8') as infile:
csv_reader = csv.reader(infile, delimiter=';')
Expand Down
3 changes: 2 additions & 1 deletion scripts/convert_txt2csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@
separator = " = " if " = " in line else "="
break
commented_lines += 1
df = pd.read_csv(fileName + ".txt", sep=separator, skiprows=commented_lines, header=None, engine="python", skipinitialspace=True, index_col=0, names=["Name", "Abbrev"])
df = pd.read_csv(fileName + ".txt", sep=separator, skiprows=commented_lines, header=None,
engine="python", skipinitialspace=True, index_col=0, names=["Name", "Abbrev"])
df.index = df.index.str.strip()
df = df.Abbrev.str.split(",", expand=True)
df.to_csv(fileName + ".csv", sep=",", header=False)
Expand Down
72 changes: 40 additions & 32 deletions scripts/delete_general_duplicates_lists.py
Original file line number Diff line number Diff line change
@@ -1,55 +1,63 @@

import pandas as pd
# Specialised journal abbreviation lists. Every title found in one of these
# files is removed from journal_abbreviations_general.csv by the statements
# further down in this script.
# Paths are relative ('../journals/...'); presumably the script is meant to be
# run from inside the scripts/ directory -- verify against how it is invoked.
# NOTE(review): in this rendered diff each path is listed twice (the entries as
# they were before the re-indent, followed by the same entries after it).
import_order = [
'../journals/journal_abbreviations_acs.csv',
'../journals/journal_abbreviations_aea.csv',
'../journals/journal_abbreviations_ams.csv',
'../journals/journal_abbreviations_annee-philologique.csv',
'../journals/journal_abbreviations_astronomy.csv',
'../journals/journal_abbreviations_dainst.csv',
'../journals/journal_abbreviations_entrez.csv',
'../journals/journal_abbreviations_geology_physics.csv',
'../journals/journal_abbreviations_geology_physics_variations.csv',
'../journals/journal_abbreviations_ieee.csv',
'../journals/journal_abbreviations_ieee_strings.csv',
'../journals/journal_abbreviations_lifescience.csv',
'../journals/journal_abbreviations_mathematics.csv',
'../journals/journal_abbreviations_mechanical.csv',
'../journals/journal_abbreviations_medicus.csv',
'../journals/journal_abbreviations_meteorology.csv',
'../journals/journal_abbreviations_sociology.csv',
'../journals/journal_abbreviations_webofscience-dotless.csv',
'../journals/journal_abbreviations_webofscience-dots.csv'
'../journals/journal_abbreviations_acs.csv',
'../journals/journal_abbreviations_aea.csv',
'../journals/journal_abbreviations_ams.csv',
'../journals/journal_abbreviations_annee-philologique.csv',
'../journals/journal_abbreviations_astronomy.csv',
'../journals/journal_abbreviations_dainst.csv',
'../journals/journal_abbreviations_entrez.csv',
'../journals/journal_abbreviations_geology_physics.csv',
'../journals/journal_abbreviations_geology_physics_variations.csv',
'../journals/journal_abbreviations_ieee.csv',
'../journals/journal_abbreviations_ieee_strings.csv',
'../journals/journal_abbreviations_lifescience.csv',
'../journals/journal_abbreviations_mathematics.csv',
'../journals/journal_abbreviations_mechanical.csv',
'../journals/journal_abbreviations_medicus.csv',
'../journals/journal_abbreviations_meteorology.csv',
'../journals/journal_abbreviations_sociology.csv',
'../journals/journal_abbreviations_webofscience-dotless.csv',
'../journals/journal_abbreviations_webofscience-dots.csv'
]


def handle_bad_line(line):
    """Report a CSV row that pandas could not parse.

    Used in this script as the ``on_bad_lines`` callback of ``pd.read_csv``
    (which pandas only supports with ``engine='python'``).

    Args:
        line: The offending row as handed over by pandas; per the pandas
            documentation this is the list of fields split on the separator.

    Returns:
        None. pandas interprets a ``None`` result from an ``on_bad_lines``
        callable as "ignore this row", so the row is reported on stdout for
        manual follow-up and is not loaded into the DataFrame.
    """
    print("Handle the problematic line manually:", line)


# Purpose: remove from the general abbreviation list every journal whose title
# (compared case-insensitively) also appears in one of the import_order lists,
# then write the remaining rows back over the general list.
# NOTE(review): this listing is a rendered diff, so most statements below
# appear twice -- once as they were before and once after the formatting change.
# read the csv files into dataframes
file_in = "../journals/journal_abbreviations_general.csv"
general = pd.read_csv(file_in, delimiter=',',header=None, names=["Title", "abbreviation","ShortestAbbreviation","frequency"],dtype={"Title": str,"abbreviation":str,"ShortestAbbreviation":str})
#Creating a new column Title lc which is Title in lower case for case insensitive comparison
general['Title_lc']=general['Title'].str.lower()
general = pd.read_csv(file_in, delimiter=',', header=None, names=["Title", "abbreviation", "ShortestAbbreviation", "frequency"], dtype={
"Title": str, "abbreviation": str, "ShortestAbbreviation": str})
# Creating a new column Title lc which is Title in lower case for case insensitive comparison
general['Title_lc'] = general['Title'].str.lower()

# Load every specialised list with the same four column names; rows the python
# parser cannot handle are passed to handle_bad_line instead of aborting the read.
dflist=[]
dflist = []
for filename in import_order:
df = pd.read_csv(filename,delimiter=',',on_bad_lines=handle_bad_line, engine='python' ,names=["Title", "abbreviation","ShortestAbbreviation","frequency"],dtype={"Title": str,"abbreviation":str,"ShortestAbbreviation":str})
df = pd.read_csv(filename, delimiter=',', on_bad_lines=handle_bad_line, engine='python', names=[
"Title", "abbreviation", "ShortestAbbreviation", "frequency"], dtype={"Title": str, "abbreviation": str, "ShortestAbbreviation": str})
dflist.append(df)

non_general_csv_df=pd.concat(dflist,ignore_index=True)
non_general_csv_df = pd.concat(dflist, ignore_index=True)

# Remove duplicates from non_general_csv_df to avoid removing valid entries
# NOTE(review): this de-duplicates on the exact 'Title', i.e. before the
# lower-cased key is created, so titles differing only in case stay as
# separate rows here.
non_general_csv_df.drop_duplicates(subset=['Title'], inplace=True,keep='first')
#Creating a new column Title lc which is Title in lower case for case insensitive comparison
non_general_csv_df['Title_lc']=non_general_csv_df['Title'].str.lower()
non_general_csv_df.drop_duplicates(
subset=['Title'], inplace=True, keep='first')
# Creating a new column Title lc which is Title in lower case for case insensitive comparison
non_general_csv_df['Title_lc'] = non_general_csv_df['Title'].str.lower()

# Merge the two dataframes on only the Title in lower case column
# indicator=True adds the '_merge' column that is filtered on below.
merged_df = pd.merge(general, non_general_csv_df, on='Title_lc', how='left', indicator=True)
merged_df = pd.merge(general, non_general_csv_df,
on='Title_lc', how='left', indicator=True)

# Keep only the rows that are present in general but not in non_general_csv_df
result_df = merged_df.loc[merged_df['_merge'] == 'left_only', ['Title_lc']]

# Re-attach the original general columns to the surviving lower-cased titles.
# 'frequency' is not selected here, so it is not written back to the file.
# NOTE(review): result_df holds one row per unmatched general row, so a
# Title_lc value that occurs k times in general is paired k x k times by this
# inner merge (pandas many-to-many merge semantics). Confirm that general has
# no titles that are equal ignoring case, or filter general with isin() instead.
result_df = pd.merge(general[['Title', 'abbreviation', 'ShortestAbbreviation','Title_lc']], result_df, on='Title_lc', how='inner')
#Dropping the newly added column only used for comparison
result_df = pd.merge(general[['Title', 'abbreviation', 'ShortestAbbreviation',
'Title_lc']], result_df, on='Title_lc', how='inner')
# Dropping the newly added column only used for comparison
result_df.drop('Title_lc', axis=1, inplace=True)
# Save the result dataframe to a csv file
# The target is file_in, i.e. the input file is overwritten (no header, no index).
result_df.to_csv(file_in, header=None, index=None,sep=',')
result_df.to_csv(file_in, header=None, index=None, sep=',')
9 changes: 6 additions & 3 deletions scripts/update_mathscinet.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,16 @@
file_out = "journals/journal_abbreviations_mathematics.csv"

# Get the first two fields of the last version of MathSciNet data file, without empty values
df_new = pd.read_csv(file_in, usecols=[0, 1]).dropna()[["Full Title", "Abbrev"]]
df_new = pd.read_csv(file_in, usecols=[0, 1]).dropna()[
["Full Title", "Abbrev"]]

# Get our last mathematics data file
df_old = pd.read_csv(file_out, sep=",", escapechar="\\", header=None, names=["Full Title", "Abbrev"])
df_old = pd.read_csv(file_out, sep=",", escapechar="\\",
header=None, names=["Full Title", "Abbrev"])

# Concatenate, remove duplicates and sort by journal name
df = pd.concat([df_new, df_old], axis=0).drop_duplicates().sort_values(by=["Full Title", "Abbrev"])
df = pd.concat([df_new, df_old], axis=0).drop_duplicates(
).sort_values(by=["Full Title", "Abbrev"])

# Remove values where journal name is equal to abbreviation
df = df[df["Full Title"].str.lower() != df["Abbrev"].str.lower()]
Expand Down

0 comments on commit 2fc8000

Please sign in to comment.