Commit: more analyses
cstenkamp committed Apr 7, 2022
1 parent 285d12d commit 2e9038d
Showing 14 changed files with 640 additions and 80 deletions.
10 changes: 7 additions & 3 deletions .run/8) Rank using Salient Directions best siddata2022.run.xml
@@ -7,16 +7,20 @@
       <env name="PYTHONUNBUFFERED" value="1" />
       <env name="MA_DATASET" value="siddata2022" />
       <env name="MA_LANGUAGE" value="de" />
-      <env name="MA_DEBUG" value="1" />
-      <env name="MA_PP_COMPONENTS" value="mfacsd2" />
+      <env name="MA_DEBUG" value="False" />
+      <env name="MA_PP_COMPONENTS" value="mfauhcsd2" />
       <env name="MA_TRANSLATE_POLICY" value="onlyorig" />
       <env name="MA_MIN_WORDS_PER_DESC" value="80" />
       <env name="MA_QUANTIFICATION_MEASURE" value="tfidf" />
       <env name="MA_EMBED_ALGO" value="mds" />
       <env name="MA_EMBED_DIMENSIONS" value="200" />
       <env name="MA_EXTRACTION_METHOD" value="tfidf" />
       <env name="MA_DCM_QUANT_MEASURE" value="count" />
-      <env name="MA_CLASSIFIER_COMPARETO_RANKING" value="count" />
+      <env name="MA_KAPPA_WEIGHTS" value="quadratic" />
+      <env name="MA_CLASSIFIER_SUCCMETRIC" value="kappa_digitized_onlypos_2" />
+      <env name="MA_PRIM_LAMBDA" value="0.5" />
+      <env name="MA_SEC_LAMBDA" value="0.2" />
+      <env name="MA_CLUSTER_DIRECTION_ALGO" value="reclassify" />
     </envs>
     <option name="SDK_HOME" value="" />
     <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/derive_conceptualspace/cli" />
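The MA_-prefixed variables above mirror the pipeline's configuration keys (dataset, embed_algo, the newly added prim_lambda/sec_lambda, and so on). As a rough illustration of how such variables can be gathered into a config dict, here is a minimal sketch; the env_to_config helper and its lower-casing convention are assumptions for illustration, not this repository's actual loading code:

import os

def env_to_config(prefix="MA_"):
    # Hypothetical helper: collect all MA_-prefixed environment variables into a
    # config dict, e.g. MA_EMBED_ALGO=mds -> {"embed_algo": "mds"}.
    return {k[len(prefix):].lower(): v for k, v in os.environ.items() if k.startswith(prefix)}

# With the run configuration above in the environment, this would yield
# {"dataset": "siddata2022", "language": "de", "prim_lambda": "0.5", ...}
# (values stay strings; any typing/parsing would happen downstream).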
4 changes: 2 additions & 2 deletions config/derrac2015_edited.yml
@@ -9,7 +9,7 @@ translate_policy: onlyorig
 quantification_measure: [ppmi, tfidf]
 dissim_measure: norm_ang_dist
 embed_algo: mds
-embed_dimensions: [3, 50, 200]
+embed_dimensions: [3, 50, 100, 200]
 extraction_method: tfidf
 #TODO: DESC15 aimed for ~22k keywords => tune my params so that I also end up with something like that; with the current ones it's only 2402!
 #candidate_min_term_count: 25 #movies has a samples-to-threshold value of 100, placetypes 35, 20newsgroups 614, so for 8000 courses any threshold from 2 to 25 seems reasonable (BUT see above, I get too few)!!
@@ -21,7 +21,7 @@ classifier: SVM
 kappa_weights: quadratic
 classifier_succmetric: [kappa_rank2rank_onlypos_min, kappa_digitized_onlypos_2, kappa_count2rank_onlypos]
 prim_lambda: 0.5
-sec_lambda: 0.2
+sec_lambda: 0.1
 __perdataset__:
   placetypes:
     extraction_method: all #in the placetypes dataset, ALL words are candidates (21.8k)
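Several keys in this config are list-valued (quantification_measure, embed_dimensions, classifier_succmetric), which suggests a grid of pipeline runs over all combinations. A minimal sketch of such an expansion, assuming list values are meant as a Cartesian product; the expand_grid helper is hypothetical, not this repository's code:

from itertools import product

def expand_grid(config):
    # Hypothetical helper: expand every list-valued entry of a config dict
    # into the Cartesian product of single-valued configs.
    keys = list(config)
    value_lists = [v if isinstance(v, list) else [v] for v in config.values()]
    return [dict(zip(keys, combo)) for combo in product(*value_lists)]

# Example: 2 quantification measures x 4 embedding dimensionalities = 8 runs.
configs = expand_grid({"quantification_measure": ["ppmi", "tfidf"],
                       "embed_dimensions": [3, 50, 100, 200],
                       "embed_algo": "mds"})
assert len(configs) == 8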
2 changes: 1 addition & 1 deletion derive_conceptualspace/_version.py
@@ -1 +1 @@
-__version__ = "1.1.20220225"
+__version__ = "1.2.20220407"
61 changes: 60 additions & 1 deletion derive_conceptualspace/cli/run_pipeline.py
@@ -383,14 +383,73 @@ def decision_trees(ctx):
 
 @generate_conceptualspace.command()
 @click_pass_add_context
-def rank_saldirs(ctx):
+def rank_saldirs_DEPRECATED(ctx):
     ctx.obj["pp_descriptions"] = ctx.p.load(None, "pp_descriptions", loader=DescriptionList.from_json, silent=True) #TODO really silent?
     ctx.obj["featureaxes"] = ctx.p.load(None, "featureaxes", loader=featureaxes_loader)
     ctx.obj["clusters"] = ctx.p.load(None, "clusters")
     #TODO this should rather contain the code from run_pipeline.decision_trees
     rank_saldirs_base(ctx.obj["pp_descriptions"], ctx.obj["embedding"], ctx.obj["featureaxes"], ctx.obj["filtered_dcm"],
                       prim_lambda=ctx.get_config("prim_lambda"), sec_lambda=ctx.get_config("sec_lambda"), metricname=ctx.get_config("classifier_succmetric"))
 
+#TODO move me!
+import numpy as np
+def get_decisions(X_test, clf, catnames, axnames):
+    n_nodes = clf.tree_.node_count
+    children_left = clf.tree_.children_left
+    children_right = clf.tree_.children_right
+    classes = [catnames[clf.classes_[np.argmax(i)]] for i in clf.tree_.value]
+    node_depth = np.zeros(shape=n_nodes, dtype=np.int64)
+    is_leaves = np.zeros(shape=n_nodes, dtype=bool)
+    stack = [(0, 0)]  # start with the root node id (0) and its depth (0)
+    while len(stack) > 0:
+        # `pop` ensures each node is visited only once
+        node_id, depth = stack.pop()
+        node_depth[node_id] = depth
+        # if the left and right child of a node differ, we have a split node
+        is_split_node = children_left[node_id] != children_right[node_id]
+        # for a split node, append both children and their depth to `stack` so we can loop through them
+        if is_split_node:
+            stack.append((children_left[node_id], depth + 1))
+            stack.append((children_right[node_id], depth + 1))
+        else:
+            is_leaves[node_id] = True
+    alls = {}
+    for i in range(n_nodes):
+        if not is_leaves[i]:
+            alls.setdefault(node_depth[i], []).append((axnames[clf.tree_.feature[i]], clf.tree_.threshold[i]))
+    return (alls[0]+alls[1]) if len(alls) > 1 else alls[0]
+
+@generate_conceptualspace.command()
+@click_pass_add_context
+def rank_saldirs(ctx):
+    from derive_conceptualspace.semantic_directions.cluster_names import get_name_dict
+    from tqdm import tqdm
+    import pandas as pd
+    from derive_conceptualspace.evaluate.shallow_trees import classify_shallowtree
+    clusters = ctx.obj["clusters"] = ctx.p.load(None, "clusters", loader=cluster_loader)
+    cluster_reprs = ctx.obj["cluster_reprs"] = ctx.p.load(None, "cluster_reprs")
+    embedding = ctx.obj["embedding"] = ctx.p.load(None, "embedding")
+    descriptions = ctx.obj["pp_descriptions"] = ctx.p.load(None, "pp_descriptions", loader=DescriptionList.from_json, silent=True) #TODO really silent?
+    embedding = embedding["embedding"].embedding_
+
+    clus_rep_algo = "top_1" #TODO obviously this should come from the config
+    cluster_names = get_name_dict(clusters["clusters"], cluster_reprs, clus_rep_algo)
+    #first I want the distances to the origins of the respective dimensions (induced by the clusters), which induce the respective rankings (see DESC15 p.24u, proj2 of load_semanticspaces.load_projections)
+    axis_dists = {i: {k: v.dist(embedding[i]) for k, v in clusters["directions"].items()} for i in tqdm(range(len(embedding)))}
+    df = pd.DataFrame(axis_dists).T
+    best_per_dim = {k: descriptions._descriptions[v].title for k, v in df.idxmax().to_dict().items()}
+
+    print("Highest-ranking descriptions per dimension:\n "+"\n ".join([f"*b*{cluster_names[k].rjust(max([len(cluster_names[i]) for i in best_per_dim.keys()][:20]))}*b*: {v}" for k, v in best_per_dim.items()][:20]))
+    #TODO also show places 2, 3, 4 - here we again see very similar ones ("football stadium", "stadium", "fan" for "goalie")
+    #TODO axis_dists is all I need for the movietuner already!! I can say "give me something like X, only with more Y"
+    tr = classify_shallowtree(clusters, embedding, descriptions, ctx.obj["dataset_class"], one_vs_rest=True, dt_depth=1, test_percentage_crossval=0.33,
+                              classes="fachbereich", cluster_reprs=cluster_reprs, verbose=False, return_features=True, balance_classes=True, do_plot=False)
+    important_directions = [get_decisions(embedding, t, [i[1] for i in tr[4]], tr[-1])[0][0] for t in tr[0]]
+    best_importants = {f"{cluster_names[i]} ({j[1]})": best_per_dim[i] for i, j in zip(important_directions, tr[4])}
+    print("Highest-ranking descriptions per important dimension:\n " + "\n ".join(
+        [f"*b*{k.rjust(max([len(i) for i in best_importants.keys()]))}*b*: {v}" for k, v in best_importants.items()]))
+
+
 
 @cli.command()
 @click_pass_add_context
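The new get_decisions walks a fitted sklearn decision tree through its clf.tree_ arrays (children_left/children_right to find split nodes, feature and threshold for the tests) and returns the (axis name, threshold) pairs of the splits at depths 0 and 1. A usage sketch on toy data; the direction and class names are invented for illustration:

import numpy as np
from sklearn.tree import DecisionTreeClassifier

rng = np.random.default_rng(0)
X = rng.random((100, 3))                   # 100 samples in 3 "directions"
y = (X[:, 1] > 0.5).astype(int)            # class determined by direction 1

clf = DecisionTreeClassifier(max_depth=2).fit(X, y)
catnames = {0: "not_beachy", 1: "beachy"}  # class label -> class name
axnames = ["mountain", "beach", "city"]    # feature index -> direction name

print(get_decisions(X, clf, catnames, axnames))
# e.g. [('beach', 0.49...)] - the splitting direction(s) and threshold(s)

Note that the X_test argument is currently unused inside get_decisions; only the fitted classifier and the two name mappings matter.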
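The movietuner TODO in rank_saldirs points out that axis_dists (items x directions) already supports "like X, but more Y" queries. A minimal sketch of that idea on a DataFrame shaped like the df built there; the naive additive scoring and the more_of helper are assumptions, not code from this commit:

import pandas as pd

def more_of(df, item, direction, top_n=5):
    # Hypothetical "like X, but more Y": prefer items close to `item` across
    # all directions, with a bonus for ranking high on `direction`.
    closeness = -((df - df.loc[item]) ** 2).sum(axis=1)
    score = closeness + df[direction]
    return score.drop(item).nlargest(top_n)

# Toy frame with the same orientation as df (rows: items, columns: directions):
toy = pd.DataFrame({"goalie": [0.1, 0.9, 0.3], "beach": [0.8, 0.2, 0.7]},
                   index=["course_a", "course_b", "course_c"])
print(more_of(toy, "course_a", "beach", top_n=2))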
