Commit abd192e
pin version numbers for more reproducibility on server
cstenkamp committed Dec 29, 2021
1 parent 54923be commit abd192e
Showing 7 changed files with 72 additions and 51 deletions.
27 changes: 20 additions & 7 deletions derive_conceptualspace/cli/create_siddata_dataset.py
@@ -47,6 +47,7 @@
 from derive_conceptualspace.util.dtm_object import dtm_dissimmat_loader, dtm_loader
 
 logger = logging.getLogger(basename(__file__))
+flatten = lambda l: [item for sublist in l for item in sublist]
 
 ########################################################################################################################
 ########################################################################################################################
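
The added `flatten` helper is the usual one-level list flattener. A quick illustration with toy values (not from the repo):

```python
flatten = lambda l: [item for sublist in l for item in sublist]

assert flatten([[1, 2], [3], []]) == [1, 2, 3]   # flattens exactly one nesting level
assert flatten([["a"], ["b", "c"]]) == ["a", "b", "c"]
```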
@@ -142,7 +143,11 @@ def wrapped(*args, **kwargs):
 @click.option("--notify-telegram/--no-notify-telegram", default=False, help="If you want to get telegram-notified of start & end of the command")
 @click_pass_add_context
 def cli(ctx):
+    import derive_conceptualspace.settings
     print("Starting up at", datetime.now().strftime("%d.%m.%Y, %H:%M:%S"))
+    all_params = {i: get_setting(i.upper(), stay_silent=True, silent=True) for i in get_jsonpersister_args()[0]}
+    default_params = {k[len("DEFAULT_"):].lower():v for k,v in derive_conceptualspace.settings.__dict__.items() if k in ["DEFAULT_"+i.upper() for i in all_params.keys()]}
+    print("Running with the following settings:", ", ".join([f"{k}: *{'b' if v==default_params[k] else 'r'}*{v}*{'b' if v==default_params[k] else 'r'}*" for k, v in all_params.items()]))
     setup_logging(ctx.obj["log"], ctx.obj["logfile"])
     set_debug(ctx)
     ctx.obj["json_persister"] = setup_json_persister(ctx)
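
The new startup banner resolves every persister parameter via `get_setting` and compares it to its `DEFAULT_*` counterpart from `settings.py`; values still at their default get the `*b*` marker, overridden ones `*r*` (apparently markers for the repo's console formatter). A minimal sketch of the default-detection step, with hypothetical setting names:

```python
# Sketch only: hypothetical module defaults and active settings.
defaults = {"DEFAULT_EMBED_DIMENSIONS": 100, "DEFAULT_PRIM_LAMBDA": 0.45}
active = {"embed_dimensions": 100, "prim_lambda": 0.3}

# Map DEFAULT_FOO -> foo for every active setting that has a default.
default_params = {k[len("DEFAULT_"):].lower(): v for k, v in defaults.items()
                  if k in ["DEFAULT_" + i.upper() for i in active]}

for name, value in active.items():
    state = "default" if value == default_params.get(name) else "overridden"
    print(f"{name}: {value} ({state})")
# embed_dimensions: 100 (default)
# prim_lambda: 0.3 (overridden)
```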
@@ -335,10 +340,19 @@ def show_data_info(ctx):
print("Relevant Metainfo:", ", ".join([f"{k}: *b*{v}*b*" for k, v in ctx.obj["json_persister"].loaded_relevant_metainf.items()]))
data_dirs = {k: v[1].replace(ctx.obj["json_persister"].in_dir, "data_dir/") for k, v in ctx.obj["json_persister"].loaded_objects.items()}
print("Directories:\n ", "\n ".join(f"{k.rjust(max(len(i) for i in data_dirs))}: {v}" for k,v in data_dirs.items()))
dependencies = {k.replace("preprocessed_descriptions","pp_descriptions"): set([i.replace("preprocessed_descriptions","pp_descriptions") for i in v[2] if i != "this"]) for k,v in ctx.obj["json_persister"].loaded_objects.items()}
dependencies = {k: set([i for i in v[2] if i != "this"]) for k,v in ctx.obj["json_persister"].loaded_objects.items()}
#figuring out when a new param was first necessary
param_intro = {k: v[3].get("relevant_params") if v[3] else None for k, v in ctx.obj["json_persister"].loaded_objects.items()}
newparam = {}
for key, val in {k: list(v.keys()) for k, v in param_intro.items() if v}.items():
for elem in val:
if elem not in flatten(newparam.values()):
newparam.setdefault(key, []).append(elem)
#/figuring out when a new param was first necessary
dot = Digraph()
for key in dependencies:
dot.node(key, key)
add_txt = "\n ".join([f"{el}: {ctx.obj['json_persister'].loaded_relevant_params[el]}" for el in newparam.get(key, [])])
dot.node(key, key+("\n\n "+add_txt if add_txt else ""))
dot.edges([[k, e] for k, v in dependencies.items() for e in v])
# print(dot.source) #TODO save to file
if ctx.obj["verbose"]:
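
The `newparam` bookkeeping walks the loaded artifacts in order and credits each parameter to the first artifact whose metadata mentions it, so each Digraph node can be labeled with exactly the parameters it introduces. On toy data (hypothetical artifact and parameter names) the loop behaves like this:

```python
flatten = lambda l: [item for sublist in l for item in sublist]

# Hypothetical artifacts in load order, with the params their metadata lists.
param_intro = {"pp_descriptions": {"translate_policy": 1},
               "dtm": {"translate_policy": 1, "quantification_measure": 1},
               "embedding": {"quantification_measure": 1, "embed_dimensions": 1}}

newparam = {}
for key, val in {k: list(v.keys()) for k, v in param_intro.items() if v}.items():
    for elem in val:
        if elem not in flatten(newparam.values()):   # first artifact mentioning it wins
            newparam.setdefault(key, []).append(elem)

assert newparam == {"pp_descriptions": ["translate_policy"],
                    "dtm": ["quantification_measure"],
                    "embedding": ["embed_dimensions"]}
```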
@@ -357,6 +371,7 @@ def show_data_info(ctx):
     dates = {k2:v2 for k2,v2 in {k: v[3]["date"] if isinstance(v[3], dict) and "date" in v[3] else None for k,v in ctx.obj["json_persister"].loaded_objects.items()}.items() if v2 is not None}
     print("Dates:\n ", "\n ".join(f"{k.rjust(max(len(i) for i in dates))}: {v}" for k,v in dates.items()))
 
+
 @generate_conceptualspace.command()
 @click_pass_add_context
 def rank_courses_saldirs(ctx):
@@ -365,16 +380,14 @@ def rank_courses_saldirs(ctx):
         desc.embedding = embedding
     print()
 
-#
 
 # @prepare_candidateterms.command()
-# @click.pass_context
 # @click.argument("dtm-filename", type=str)
-# @telegram_notify(only_terminal=True, only_on_fail=False, log_start=True)
+# @click_pass_add_context
+# # @telegram_notify(only_terminal=True, only_on_fail=False, log_start=True)
 # def run_lsi(ctx, dtm_filename):
 #     """as in [VISR12: 4.2.1]"""
-#     # TODO options here:
-#     # * if it should filter AFTER the LSI
+#     from derive_conceptualspace.util.jsonloadstore import json_load
 #     import numpy as np
 #     from derive_conceptualspace.util.dtm_object import DocTermMatrix
 #     from os.path import splitext
4 changes: 3 additions & 1 deletion
@@ -41,7 +41,9 @@ def create_candidate_svms(dcm, mds, pp_descriptions, verbose):
     return clusters, cluster_directions, dict(sorted_kappa), {k: (v.intercept, list(v.normal)) for k,v in decision_planes.items()}
 
 
-def select_salient_terms(sorted_kappa, decision_planes, prim_lambda=0.24, sec_lambda=0.18):
+def select_salient_terms(sorted_kappa, decision_planes, prim_lambda=None, sec_lambda=None):
+    prim_lambda = prim_lambda or get_setting("PRIM_LAMBDA")
+    sec_lambda = sec_lambda or get_setting("SEC_LAMBDA")
     #TODO waitwaitwait. Am I 100% sure that the intercepts of the decision_planes are irrelevant?!
     get_tlambda = lambda sorted_kappa, lamb: [i[0] for i in sorted_kappa if i[1] > lamb]
     get_tlambda2 = lambda sorted_kappa, primlamb, seclamb: list(set(get_tlambda(sorted_kappa, seclamb))-set(get_tlambda(sorted_kappa, primlamb)))
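
Moving the lambda defaults out of the signature and into `get_setting` gives them a single source of truth in `settings.py` (the new `DEFAULT_PRIM_LAMBDA`/`DEFAULT_SEC_LAMBDA` below). One caveat of the `value or get_setting(...)` idiom: an explicitly passed `0.0` is falsy and silently falls back to the setting, which an `is None` check would avoid. A sketch of the difference:

```python
DEFAULT_PRIM_LAMBDA = 0.45  # mirrors the new default in settings.py

def resolve(prim_lambda=None):
    by_or = prim_lambda or DEFAULT_PRIM_LAMBDA    # 0.0 -> 0.45 (fallback fires)
    by_none = DEFAULT_PRIM_LAMBDA if prim_lambda is None else prim_lambda  # 0.0 kept
    return by_or, by_none

assert resolve(0.0) == (0.45, 0.0)
```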
15 changes: 10 additions & 5 deletions derive_conceptualspace/settings.py
@@ -11,10 +11,10 @@
 ALL_TRANSLATE_POLICY = ["translate"] #, "onlyeng", "origlan"
 ALL_EXTRACTION_METHOD = ["pp_keybert", "keybert"]
 ALL_QUANTIFICATION_MEASURE = ["ppmi", "tf-idf"]
-ALL_EMBED_ALGO = ["tsne", "mds"]
-ALL_EMBED_DIMENSIONS = [3, 100]
-ALL_DCM_QUANT_MEASURE = ["tf-idf", "count", "binary"] #TODO check if these and the quantification_measure are interchangeable!! (also: tag-share is missing)
-#TODO: try isomap & tf-idf in place of MDS
+ALL_EMBED_ALGO = ["mds", "tsne"]
+ALL_EMBED_DIMENSIONS = [100, 50]#, 200, 3]
+ALL_DCM_QUANT_MEASURE = ["tf-idf", "count"]#, "binary"] #TODO check if these and the quantification_measure are interchangeable!! (also: tag-share is missing)
+#TODO: try isomap & tsne in place of MDS
 
 
 #set default-values for the ALL_... variables
@@ -27,6 +27,9 @@
 DEFAULT_DEBUG_N_ITEMS = 100
 DEFAULT_CANDIDATE_MIN_TERM_COUNT = 25
 DEFAULT_FASTER_KEYBERT = False
+
+DEFAULT_PRIM_LAMBDA = 0.45
+DEFAULT_SEC_LAMBDA = 0.3
 ################ /new stuff #################
 
 DEFAULT_STANFORDNLP_VERSION = "4.2.2" #whatever's newest at https://stanfordnlp.github.io/CoreNLP/history.html
@@ -100,12 +103,14 @@ def get_envvar(envvarname):
         return None
 
 
-def get_setting(name, default_none=False, silent=False, set_env_from_default=False):
+def get_setting(name, default_none=False, silent=False, set_env_from_default=False, stay_silent=False):
+    suppress_further = True if not silent else True if stay_silent else False
     if get_envvar(ENV_PREFIX+"_"+name) is not None:
         return get_envvar(ENV_PREFIX+"_"+name) if get_envvar(ENV_PREFIX+"_"+name) != "none" else None
     if "DEFAULT_"+name in globals():
         if not silent and not get_envvar(ENV_PREFIX+"_"+name+"_shutup"):
             print(f"returning setting for {name} from default value: {globals()['DEFAULT_'+name]}")
+        if suppress_further and not get_envvar(ENV_PREFIX + "_" + name + "_shutup"):
+            set_envvar(ENV_PREFIX+"_"+name+"_shutup", True)
         if set_env_from_default:
             set_envvar(ENV_PREFIX+"_"+name, globals()['DEFAULT_'+name])
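
With `stay_silent`, the default-value notice prints at most once per process: the first lookup may print, after which a `<ENV_PREFIX>_<name>_shutup` environment marker suppresses further notices. A stripped-down sketch of the mechanism (the `MA` prefix is assumed, matching the `MA_*` variables in the Snakefile):

```python
import os

def default_notice_once(name, default, prefix="MA"):
    # Print the "using default" notice only on first lookup, then set the marker.
    shutup_var = f"{prefix}_{name}_shutup"
    if not os.environ.get(shutup_var):
        print(f"returning setting for {name} from default value: {default}")
        os.environ[shutup_var] = "1"   # later lookups in this process stay quiet
    return default

default_notice_once("PRIM_LAMBDA", 0.45)   # prints the notice
default_notice_once("PRIM_LAMBDA", 0.45)   # silent
```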
4 changes: 2 additions & 2 deletions derive_conceptualspace/util/jsonloadstore.py
@@ -247,7 +247,7 @@ def load(self, filename, save_basename, relevant_metainf=None, ignore_params=Non
# print(f"Loading {tmp['basename']}: {self.loaded_objects[k][2]}")
if k not in self.loaded_objects: self.loaded_objects[k] = v
elif tmp["basename"] in v[2]:
assert str(self.loaded_objects[k][3]) == str(v[3])
assert str({k:v for k,v in self.loaded_objects[k][3].items() if k not in ["relevant_params", "relevant_metainf"]}) == str(v[3])
self.loaded_objects[k][2].extend(v[2])
# the pp_descriptions are used in candidate_terms AND in postprocess_candidates. So when pp_cands loads stuff, it needs to note that pp_descriptions were used in boht.
elif k in self.loaded_objects:
@@ -262,7 +262,7 @@
                 assert k in complete_metainf, f"The file `{tmp['basename']}` required the relevant-meta-inf `{k}`, but you don't have a value for this!"
                 assert complete_metainf[k] in [v, "ANY"], f"The file `{tmp['basename']}` required the relevant-meta-inf `{k}` to be `{v}`, but here it is `{complete_metainf[k]}`!"
         obj = tmp["object"] if "object" in tmp else tmp
-        obj_info = tmp.get("obj_info")
+        obj_info = {**tmp.get("obj_info"), "relevant_params": tmp.get("relevant_params", {}), "relevant_metainf": tmp.get("relevant_metainf", {})}
         if loader is not None:
             obj = loader(**obj)
         for k, v in self.loaded_relevant_params.items():
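
Because `obj_info` now carries `relevant_params` and `relevant_metainf` next to the original metadata, the consistency assert in the previous hunk filters those keys back out before comparing two loads of the same dependency; otherwise identical files reached via different dependents would spuriously mismatch. The comparison idea on toy metadata:

```python
# Toy metadata: same underlying file, one copy with the merged-in keys.
a = {"date": "2021-12-29", "relevant_params": {"embed_dimensions": 100}}
b = {"date": "2021-12-29"}

strip = lambda d: {k: v for k, v in d.items()
                   if k not in ("relevant_params", "relevant_metainf")}
assert str(strip(a)) == str(strip(b))   # agrees once the merged keys are ignored
```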
6 changes: 3 additions & 3 deletions requirements-dev.txt
@@ -1,4 +1,4 @@
-pytest
-nbstripout
+pytest==6.2.3
+nbstripout==0.3.9
 # nbstripout --install --attributes .gitattributes
-nbdime
+nbdime==3.1.0
48 changes: 24 additions & 24 deletions requirements.txt
@@ -1,29 +1,29 @@
-numpy
-jupyter
-jupyterlab
-matplotlib
-scipy
-pandas
-nltk
-HanTa #Lemmatizer for German
-tqdm
-sklearn
-seaborn
-ipyparams
-langdetect
+numpy==1.20.3
+jupyter==1.0.0
+jupyterlab==3.0.14
+matplotlib==3.4.1
+scipy==1.7.1
+pandas==1.2.4
+nltk==3.6.2
+HanTa==0.2.0 #Lemmatizer for German
+tqdm==4.60.0
+scikit-learn==1.0
+seaborn==0.11.1
+ipyparams==0.2.1
+langdetect==1.0.9
 google-cloud-translate==2.0.1
-click
-stanfordcorenlp
-keybert
-unidecode
+click==7.1.2
+stanfordcorenlp==3.9.1.1
+keybert==0.4.0
+unidecode==1.3.2
 
-python-Levenshtein #for data_exploration.ipynb in data
-python-dotenv
+python-Levenshtein==0.12.2 #for data_exploration.ipynb in data
+python-dotenv==0.17.1
 
 #sacred
-sacred
-pymongo
-incense
+sacred==0.8.2
+pymongo==3.11.4
+incense==0.0.11
 
 
 #model-downloader
@@ -33,5 +33,5 @@ git+https://github.com/cstenkamp/python-seafile.git@v0.1.2#egg=python_seafile #o
 plotly==4.14.3
 #to render in jupyterlab install jupyterlab extension: conda run -n Derive_Conceptualspace python -m jupyter labextension uninstall jupyterlab-plotly@5.3.1
 
-snakemake
-gensim
+snakemake==6.9.1
+gensim==4.1.2
19 changes: 10 additions & 9 deletions workflow/Snakefile
@@ -1,13 +1,14 @@
"""
How to plot DAG as graph:
`PYTHONPATH=$(realpath .):$PYTHONPATH snakemake --cores 1 -np --directory $MA_DATA_DIR --dag | grep -A99999 "digraph" | dot -Tsvg > dag.svg`
How to run for 1 MDS:
`MA_DEBUG=1 PYTHONPATH=$(realpath .):$PYTHONPATH snakemake --cores 1 -p --directory $MA_DATA_DIR default`
or `MA_DEBUG=1 PYTHONPATH=$(realpath .):$PYTHONPATH snakemake --cores 1 -p --directory $MA_DATA_DIR tcsldp_translate/ppmi_3d/pp_keybert_count/clusters.json`
you can also: `(export $(cat $MA_ENV_FILE | xargs) && PYTHONPATH=$(realpath .):$PYTHONPATH snakemake --cores 1 -p --directory $MA_DATA_DIR default)`
If run on server:
`rsync -az --progress $MA_DATA_DIR etlpipelines:~/data --exclude .snakemake`
Plot DAG: `PYTHONPATH=$(realpath .):$PYTHONPATH snakemake --cores 1 -np --directory $MA_DATA_DIR --dag | grep -A99999 "digraph" | dot -Tsvg > dag.svg`
Run default: `MA_DEBUG=1 PYTHONPATH=$(realpath .):$PYTHONPATH snakemake --cores 1 -p --directory $MA_DATA_DIR default`
or `MA_DEBUG=1 PYTHONPATH=$(realpath .):$PYTHONPATH snakemake --cores 1 -p --directory $MA_DATA_DIR tcsldp_translate/ppmi_3d/pp_keybert_count/clusters.json`
or: `(export $(cat $MA_ENV_FILE | xargs) && PYTHONPATH=$(realpath .):$PYTHONPATH snakemake --cores 1 -p --directory $MA_DATA_DIR default)`
ALL Combis: `ma_cont snakemake --cores 1 -p --directory /opt/data all --keep-going`
Get results: `rsync -az --progress $MA_DATA_DIR etlpipelines:~/data --exclude .snakemake`
"""

#TODO before writing about it: have a way of not having to set the PYTHONPATH as cmd-arg

import os

from snakemake.io import expand
@@ -67,7 +68,7 @@ class Context():


 def initialize_snakemake():
-    if int(os.getenv(f"{ENV_PREFIX}_SNAKEMAKE_TELEGRAM")):
+    if int(os.getenv(f"{ENV_PREFIX}_SNAKEMAKE_TELEGRAM", 0)):
         if not os.getenv(f"{ENV_PREFIX}_SMK_INITIALIZED"):
             print("Telegram-Notifications ON.")
             for k, v in dict(globals()).items():
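
This one-argument fix matters because `os.getenv` returns `None` for unset variables and `int(None)` raises a `TypeError`; with the default `0`, an unset `..._SNAKEMAKE_TELEGRAM` simply disables notifications. A quick illustration (the expanded `MA` prefix is assumed):

```python
import os

os.environ.pop("MA_SNAKEMAKE_TELEGRAM", None)      # simulate an unset variable
try:
    int(os.getenv("MA_SNAKEMAKE_TELEGRAM"))        # old code path
except TypeError:
    print("crashes when the variable is unset")

print(int(os.getenv("MA_SNAKEMAKE_TELEGRAM", 0)))  # new code path -> 0
```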
