diff --git a/derive_conceptualspace/cli/create_siddata_dataset.py b/derive_conceptualspace/cli/create_siddata_dataset.py index 2cb2a57..9f3fb17 100644 --- a/derive_conceptualspace/cli/create_siddata_dataset.py +++ b/derive_conceptualspace/cli/create_siddata_dataset.py @@ -38,7 +38,7 @@ from derive_conceptualspace.create_spaces.spaces_main import ( create_dissim_mat as create_dissim_mat_base, ) -from derive_conceptualspace.create_spaces.create_mds import ( +from derive_conceptualspace.create_spaces.create_embedding import ( create_embedding as create_embedding_base, ) from derive_conceptualspace.semantic_directions.create_candidate_svm import ( @@ -308,7 +308,7 @@ def generate_conceptualspace(ctx): """[group] CLI base to create the actual conceptual spaces""" ctx.obj["pp_descriptions"] = ctx.obj["json_persister"].load(None, "pp_descriptions", loader=pp_descriptions_loader, ignore_params=["quantification_measure", "embed_dimensions"]) ctx.obj["filtered_dcm"] = ctx.obj["json_persister"].load(None, "filtered_dcm", loader=dtm_loader, ignore_params=["quantification_measure", "embed_dimensions"]) - ctx.obj["embedding"] = ctx.obj["json_persister"].load(None, "embedding", ignore_params=["extraction_method", "dcm_quant_measure"], loader=lambda **args: args["mds"]) + ctx.obj["embedding"] = ctx.obj["json_persister"].load(None, "embedding", ignore_params=["extraction_method", "dcm_quant_measure"], loader=lambda **args: args["embedding"]) assert ctx.obj["embedding"].embedding_.shape[0] == len(ctx.obj["filtered_dcm"].dtm), f'The Doc-Candidate-Matrix contains {len(ctx.obj["filtered_dcm"].dtm)} items But your embedding has {ctx.obj["embedding"].embedding_.shape[0] } descriptions!' 
diff --git a/derive_conceptualspace/create_spaces/create_mds.py b/derive_conceptualspace/create_spaces/create_embedding.py similarity index 91% rename from derive_conceptualspace/create_spaces/create_mds.py rename to derive_conceptualspace/create_spaces/create_embedding.py index f170640..0e4511b 100644 --- a/derive_conceptualspace/create_spaces/create_mds.py +++ b/derive_conceptualspace/create_spaces/create_embedding.py @@ -66,7 +66,8 @@ def create_embedding(dissim_mat, embed_dimensions, embed_algo): return create_mds(dissim_mat, embed_dimensions) elif embed_algo == "tsne": return create_tsne(dissim_mat, embed_dimensions) - + else: + raise NotImplementedError(f"Algorithm {embed_algo} is not implemented!") def create_mds(dissim_mat, embed_dimensions): dtm, dissim_mat = dissim_mat @@ -79,8 +80,6 @@ def create_mds(dissim_mat, embed_dimensions): def create_tsne(dissim_mat, embed_dimensions): dtm, dissim_mat = dissim_mat - #TODO - isn't isomap better suited than MDS? https://scikit-learn.org/stable/modules/manifold.html#multidimensional-scaling - # !! [DESC15] say they compared it and it's worse ([15] of [DESC15])!!! 
- embedding = MDS(n_components=embed_dimensions, random_state=get_setting("RANDOM_SEED", default_none=True), dissimilarity="precomputed") - mds = embedding.fit(dissim_mat) - return mds + embedding = TSNE(n_components=embed_dimensions, random_state=get_setting("RANDOM_SEED", default_none=True), metric="precomputed", init="random", method="exact" if embed_dimensions >= 4 else "barnes_hut") + tsne = embedding.fit(dissim_mat) + return tsne diff --git a/derive_conceptualspace/create_spaces/spaces_main.py b/derive_conceptualspace/create_spaces/spaces_main.py index ec2d7d8..f2d52db 100644 --- a/derive_conceptualspace/create_spaces/spaces_main.py +++ b/derive_conceptualspace/create_spaces/spaces_main.py @@ -1,7 +1,7 @@ from os.path import basename import logging -from .create_mds import create_dissimilarity_matrix +from .create_embedding import create_dissimilarity_matrix from derive_conceptualspace.settings import get_setting from derive_conceptualspace.util.text_tools import tf_idf, ppmi diff --git a/derive_conceptualspace/settings.py b/derive_conceptualspace/settings.py index 9547c95..6a3fea3 100644 --- a/derive_conceptualspace/settings.py +++ b/derive_conceptualspace/settings.py @@ -11,7 +11,7 @@ ALL_TRANSLATE_POLICY = ["translate"] #, "onlyeng", "origlan" ALL_EXTRACTION_METHOD = ["pp_keybert", "keybert"] ALL_QUANTIFICATION_MEASURE = ["ppmi", "tf-idf"] -ALL_EMBED_ALGO = ["t-SNE", "MDS"] +ALL_EMBED_ALGO = ["tsne", "mds"] ALL_EMBED_DIMENSIONS = [3, 100] ALL_DCM_QUANT_MEASURE = ["tf-idf", "count", "binary"] #TODO check if these and the quantification_measure are interchangeable!! 
(also: tag-share is missing) #TODO: try isomap & tf-idf in place of MDS diff --git a/workflow/Snakefile b/workflow/Snakefile index 0cd6a8a..4cc5464 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -33,7 +33,7 @@ from derive_conceptualspace.create_spaces.preprocess_descriptions import ( from derive_conceptualspace.create_spaces.spaces_main import ( create_dissim_mat as create_dissim_mat_base, ) -from derive_conceptualspace.create_spaces.create_mds import ( +from derive_conceptualspace.create_spaces.create_embedding import ( create_embedding as create_embedding_base, ) from derive_conceptualspace.semantic_directions.create_candidate_svm import ( @@ -67,7 +67,7 @@ class Context(): def initialize_snakemake(): - if os.getenv(f"{ENV_PREFIX}_SNAKEMAKE_TELEGRAM"): + if int(os.getenv(f"{ENV_PREFIX}_SNAKEMAKE_TELEGRAM") or 0): if not os.getenv(f"{ENV_PREFIX}_SMK_INITIALIZED"): print("Telegaram-Notifications ON.") for k, v in dict(globals()).items(): @@ -93,7 +93,7 @@ autoloader_di = dict( dissim_mat=dtm_dissimmat_loader, doc_cand_matrix=dtm_loader, filtered_dcm=dtm_loader, - mds=lambda **args: args["mds"], + embedding=lambda **args: args["embedding"], ) def autoload_context_inputs(ctx, inputs, wildcards, params=None, input_kwargs=None): @@ -126,9 +126,9 @@ rule all: rule default: input: expand(os.sep.join(dir_struct+["clusters.json"]), - pp_components=ALL_PP_COMPONENTS[0], translate_policy=ALL_TRANSLATE_POLICY[0], quantification_measure=ALL_QUANTIFICATION_MEASURE[0], - embed_dimensions=ALL_EMBED_DIMENSIONS[0], extraction_method=ALL_EXTRACTION_METHOD[0], dcm_quant_measure=ALL_DCM_QUANT_MEASURE[0], - embed_algo=ALL_EMBED_ALGO, n_samples=n_samples) + pp_components=get_setting("PP_COMPONENTS"), translate_policy=get_setting("TRANSLATE_POLICY"), quantification_measure=get_setting("QUANTIFICATION_MEASURE"), + embed_dimensions=get_setting("EMBED_DIMENSIONS"), extraction_method=get_setting("EXTRACTION_METHOD"), dcm_quant_measure=get_setting("DCM_QUANT_MEASURE"), + 
embed_algo=get_setting("EMBED_ALGO"), n_samples=n_samples) rule preprocess_descriptions: