Skip to content

Commit

Permalink
add tsne as embedding algorithm
Browse files Browse the repository at this point in the history
  • Loading branch information
cstenkamp committed Dec 28, 2021
1 parent 1f0fe7e commit ba036ca
Show file tree
Hide file tree
Showing 5 changed files with 15 additions and 16 deletions.
4 changes: 2 additions & 2 deletions derive_conceptualspace/cli/create_siddata_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
from derive_conceptualspace.create_spaces.spaces_main import (
create_dissim_mat as create_dissim_mat_base,
)
from derive_conceptualspace.create_spaces.create_mds import (
from derive_conceptualspace.create_spaces.create_embedding import (
create_embedding as create_embedding_base,
)
from derive_conceptualspace.semantic_directions.create_candidate_svm import (
Expand Down Expand Up @@ -308,7 +308,7 @@ def generate_conceptualspace(ctx):
"""[group] CLI base to create the actual conceptual spaces"""
ctx.obj["pp_descriptions"] = ctx.obj["json_persister"].load(None, "pp_descriptions", loader=pp_descriptions_loader, ignore_params=["quantification_measure", "embed_dimensions"])
ctx.obj["filtered_dcm"] = ctx.obj["json_persister"].load(None, "filtered_dcm", loader=dtm_loader, ignore_params=["quantification_measure", "embed_dimensions"])
ctx.obj["embedding"] = ctx.obj["json_persister"].load(None, "embedding", ignore_params=["extraction_method", "dcm_quant_measure"], loader=lambda **args: args["mds"])
ctx.obj["embedding"] = ctx.obj["json_persister"].load(None, "embedding", ignore_params=["extraction_method", "dcm_quant_measure"], loader=lambda **args: args["embedding"])
assert ctx.obj["embedding"].embedding_.shape[0] == len(ctx.obj["filtered_dcm"].dtm), f'The Doc-Candidate-Matrix contains {len(ctx.obj["filtered_dcm"].dtm)} items But your embedding has {ctx.obj["embedding"].embedding_.shape[0] } descriptions!'


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,8 @@ def create_embedding(dissim_mat, embed_dimensions, embed_algo):
return create_mds(dissim_mat, embed_dimensions)
elif embed_algo == "tsne":
return create_tsne(dissim_mat, embed_dimensions)

else:
raise NotImplementedError(f"Algorithm {embed_algo} is not implemented!")

def create_mds(dissim_mat, embed_dimensions):
dtm, dissim_mat = dissim_mat
Expand All @@ -79,8 +80,6 @@ def create_mds(dissim_mat, embed_dimensions):

def create_tsne(dissim_mat, embed_dimensions):
dtm, dissim_mat = dissim_mat
#TODO - isn't isomap better suited than MDS? https://scikit-learn.org/stable/modules/manifold.html#multidimensional-scaling
# !! [DESC15] say they compared it and it's worse ([15] of [DESC15])!!!
embedding = MDS(n_components=embed_dimensions, random_state=get_setting("RANDOM_SEED", default_none=True), dissimilarity="precomputed")
mds = embedding.fit(dissim_mat)
return mds
embedding = TSNE(n_components=embed_dimensions, random_state=get_setting("RANDOM_SEED", default_none=True), metric="precomputed")
tsne = embedding.fit(dissim_mat)
return tsne
2 changes: 1 addition & 1 deletion derive_conceptualspace/create_spaces/spaces_main.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from os.path import basename
import logging

from .create_mds import create_dissimilarity_matrix
from .create_embedding import create_dissimilarity_matrix

from derive_conceptualspace.settings import get_setting
from derive_conceptualspace.util.text_tools import tf_idf, ppmi
Expand Down
2 changes: 1 addition & 1 deletion derive_conceptualspace/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
ALL_TRANSLATE_POLICY = ["translate"] #, "onlyeng", "origlan"
ALL_EXTRACTION_METHOD = ["pp_keybert", "keybert"]
ALL_QUANTIFICATION_MEASURE = ["ppmi", "tf-idf"]
ALL_EMBED_ALGO = ["t-SNE", "MDS"]
ALL_EMBED_ALGO = ["tsne", "mds"]
ALL_EMBED_DIMENSIONS = [3, 100]
ALL_DCM_QUANT_MEASURE = ["tf-idf", "count", "binary"] #TODO check if these and the quantification_measure are interchangeable!! (also: tag-share is missing)
#TODO: try isomap & tf-idf in place of MDS
Expand Down
12 changes: 6 additions & 6 deletions workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ from derive_conceptualspace.create_spaces.preprocess_descriptions import (
from derive_conceptualspace.create_spaces.spaces_main import (
create_dissim_mat as create_dissim_mat_base,
)
from derive_conceptualspace.create_spaces.create_mds import (
from derive_conceptualspace.create_spaces.create_embedding import (
create_embedding as create_embedding_base,
)
from derive_conceptualspace.semantic_directions.create_candidate_svm import (
Expand Down Expand Up @@ -67,7 +67,7 @@ class Context():


def initialize_snakemake():
if os.getenv(f"{ENV_PREFIX}_SNAKEMAKE_TELEGRAM"):
if int(os.getenv(f"{ENV_PREFIX}_SNAKEMAKE_TELEGRAM")):
if not os.getenv(f"{ENV_PREFIX}_SMK_INITIALIZED"):
print("Telegaram-Notifications ON.")
for k, v in dict(globals()).items():
Expand All @@ -93,7 +93,7 @@ autoloader_di = dict(
dissim_mat=dtm_dissimmat_loader,
doc_cand_matrix=dtm_loader,
filtered_dcm=dtm_loader,
mds=lambda **args: args["mds"],
embedding=lambda **args: args["embedding"],
)

def autoload_context_inputs(ctx, inputs, wildcards, params=None, input_kwargs=None):
Expand Down Expand Up @@ -126,9 +126,9 @@ rule all:
rule default:
input:
expand(os.sep.join(dir_struct+["clusters.json"]),
pp_components=ALL_PP_COMPONENTS[0], translate_policy=ALL_TRANSLATE_POLICY[0], quantification_measure=ALL_QUANTIFICATION_MEASURE[0],
embed_dimensions=ALL_EMBED_DIMENSIONS[0], extraction_method=ALL_EXTRACTION_METHOD[0], dcm_quant_measure=ALL_DCM_QUANT_MEASURE[0],
embed_algo=ALL_EMBED_ALGO, n_samples=n_samples)
pp_components=get_setting("PP_COMPONENTS"), translate_policy=get_setting("TRANSLATE_POLICY"), quantification_measure=get_setting("QUANTIFICATION_MEASURE"),
embed_dimensions=get_setting("EMBED_DIMENSIONS"), extraction_method=get_setting("EXTRACTION_METHOD"), dcm_quant_measure=get_setting("DCM_QUANT_MEASURE"),
embed_algo=get_setting("EMBED_ALGO"), n_samples=n_samples)


rule preprocess_descriptions:
Expand Down

0 comments on commit ba036ca

Please sign in to comment.