
Commit

Google-translate course titles, Docker container, model up-/download included
cstenkamp committed Jun 17, 2021
1 parent 4deb9b8 commit 8c3bf93
Showing 20 changed files with 865 additions and 39 deletions.
4 changes: 4 additions & 0 deletions .dockershell.sh
@@ -0,0 +1,4 @@
alias ls='ls --color=auto'
alias ll='ls -alFh'
alias la='ls -A'
alias l='ls -CF'
41 changes: 41 additions & 0 deletions Dockerfile
@@ -0,0 +1,41 @@
#docker build -f Dockerfile --build-arg uid=${COMPOSE_UID:-1000} --build-arg gid=${COMPOSE_GID:-1000} --rm --tag derive_conceptualspaces .
#docker run -it --name derive_conceptualspaces_cont -v /home/chris/Documents/UNI_neu/Masterarbeit/data/:/opt/data derive_conceptualspaces bash
#docker start derive_conceptualspaces_cont -i
#docker container rm derive_conceptualspaces_cont -f && docker build -f Dockerfile --build-arg uid=${COMPOSE_UID:-1000} --build-arg gid=${COMPOSE_GID:-1000} --rm --tag derive_conceptualspaces .

ARG PYTHON_VERSION=3.9.1
FROM python:${PYTHON_VERSION}-buster

ARG uid
ARG gid

RUN apt-get update \
&& apt-get install -y bash git vim curl zsh htop tmux unzip nano

ARG WORKDIR=/opt/derive_conceptualspaces
COPY . ${WORKDIR}
WORKDIR ${WORKDIR}
ENV PYTHONPATH=${WORKDIR}
ENV RUNNING_IN_DOCKER=1

RUN ln -sf /usr/local/bin/python3 /usr/bin/python3
RUN ln -sf /usr/bin/python3 /usr/bin/python
RUN python3 -m pip install --upgrade pip
RUN ln -sf /usr/bin/pip3 /usr/bin/pip
RUN pip install -r ./requirements.txt

RUN groupadd -g ${gid} developer \
&& useradd -g developer -u ${uid} -m developer
USER developer

#https://dev.to/arctic_hen7/setting-up-zsh-in-docker-263f
RUN mkdir -p /home/developer/.antigen
RUN curl -L git.io/antigen > /home/developer/.antigen/antigen.zsh
COPY .dockershell.sh /home/developer/.zshrc
USER root
RUN chown -R developer:developer /home/developer/.antigen /home/developer/.zshrc
USER developer
RUN /bin/zsh /home/developer/.zshrc

ENV HOME=/home/developer
ENV SHELL=/bin/zsh
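The header comments above give the exact build and run invocations; passing your own uid/gid keeps files written to the mounted /opt/data volume owned by you instead of root. If you prefer scripting that flow, here is a minimal sketch in Python (the data path is just the example from the comments; adjust it to your machine):

```python
# Wraps the docker build/run commands from the Dockerfile header comments
# (a sketch; the default data directory is the example path from above).
import os
import subprocess

def build_and_run(data_dir="/home/chris/Documents/UNI_neu/Masterarbeit/data/"):
    uid, gid = str(os.getuid()), str(os.getgid())
    subprocess.run(["docker", "build", "-f", "Dockerfile",
                    "--build-arg", f"uid={uid}", "--build-arg", f"gid={gid}",
                    "--rm", "--tag", "derive_conceptualspaces", "."], check=True)
    subprocess.run(["docker", "run", "-it", "--name", "derive_conceptualspaces_cont",
                    "-v", f"{data_dir}:/opt/data", "derive_conceptualspaces", "bash"],
                   check=True)

if __name__ == "__main__":
    build_and_run()
```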
43 changes: 42 additions & 1 deletion README.md
@@ -1,3 +1,42 @@
## How to get the data

* Data comes from [DESC15] and can be obtained from http://www.cs.cf.ac.uk/semanticspaces/.
* Download everything there and arrange the directory structure like this (a quick sanity-check script follows below the tree):
```
movies
    classesGenres
    classesKeywords
    classesRatings
    d20
        DirectionsHeal
        clusters20.txt
        films20.mds
        films20.projected
        projections20.data
    d50
        ...
    d100
        ...
    d200
        ...
    Tokens
    filmNames.txt
    tokens.json
wines
    classes
    d20
        ...
    d50
        ...
    d100
        ...
    d200
        ...
    Tokens
    wineNames.txt
places
    ...
```
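A minimal sanity check for this layout (a sketch; it probes only a few of the paths from the tree above, and `data_base` is assumed to point at the download directory):

```python
# Sanity-check the semanticspaces download (a sketch; only a few of the
# expected paths from the tree above are probed).
from os.path import join, isdir, isfile

def check_data_layout(data_base):
    for dset in ["movies", "wines", "places"]:
        assert isdir(join(data_base, dset)), f"missing dataset dir: {dset}"
    for n_dims in [20, 50, 100, 200]:
        assert isdir(join(data_base, "movies", f"d{n_dims}")), f"missing movies/d{n_dims}"
    assert isfile(join(data_base, "movies", "d20", "films20.mds"))
    assert isfile(join(data_base, "movies", "filmNames.txt"))
    print("Data layout looks fine.")
```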
## Contributing

### Set up development environment
@@ -11,4 +50,6 @@ nbdime config-git --enable --global

### Set up Sacred

See https://sacred.readthedocs.io/en/stable/examples.html#docker-setup for the easiest way to get MongoDB and the dashboards running. The */docker* directory here is a clone of the corresponding *examples* directory from the Sacred repo. To have the same `.env` file in your local setup, I can recommend PyCharm's **EnvFile** plugin.
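For reference, a minimal sketch of a Sacred experiment wired to that MongoDB (assumptions: the `MONGO_URI` environment variable comes from your `.env` file, and the database name matches your docker setup):

```python
# Minimal Sacred experiment with a MongoObserver (a sketch; MONGO_URI and
# the db name are assumptions to be adapted to your .env/docker setup).
import os
from sacred import Experiment
from sacred.observers import MongoObserver

ex = Experiment("derive_conceptualspaces")
ex.observers.append(MongoObserver(url=os.environ["MONGO_URI"], db_name="sacred"))

@ex.config
def cfg():
    mds_dimensions = 20  # hypothetical config entry

@ex.automain
def main(mds_dimensions):
    print(f"running with mds_dimensions={mds_dimensions}")
```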


8 changes: 7 additions & 1 deletion requirements.txt
@@ -15,11 +15,17 @@ seaborn
ipyparams
nbdime
#nbdime config-git --enable --global
langdetect
google-cloud-translate==2.0.1

python-Levenshtein #for data_exploration.ipynb in data
python-dotenv

#sacred
sacred
pymongo
incense

#model-downloader
#how to install seafile-cli on your system: see https://download.seafile.com/published/seafile-user-manual/syncing_client/install_linux_client.md
git+https://github.com/cstenkamp/python-seafile.git@v0.1.2#egg=python_seafile #on PyPI there's only v0.1.0, which is broken, and even the original repo has an error with binary files
50 changes: 43 additions & 7 deletions scripts/create_siddata_dataset.py
@@ -2,7 +2,7 @@
available at http://www.cs.cf.ac.uk/semanticspaces/. Meaning: MDS, ..."""

#TODO make (snakemake?) Pipeline that runs start to finish and creates the complete directory

import hashlib
from os.path import join, isfile, dirname, basename
import re
import random
@@ -11,12 +11,18 @@

import numpy as np
import pandas as pd

from src.static.settings import SID_DATA_BASE, DEBUG, RANDOM_SEED, DATA_BASE
from main.util.logging import setup_logging
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
from tqdm import tqdm
import html
import json

from src.static.settings import SID_DATA_BASE, DEBUG, RANDOM_SEED, SPACES_DATA_BASE
from src.main.util.logutils import setup_logging
from src.main.util.pretty_print import pretty_print as print
from src.main.load_data.siddata_data_prep.create_mds import preprocess_data
from src.main.load_data.siddata_data_prep.jsonloadstore import json_dump, json_load
from src.main.util.google_translate import translate_text

logger = logging.getLogger(basename(__file__))

@@ -25,16 +31,46 @@
def main():
    setup_logging("INFO")
    random.seed(RANDOM_SEED)
    for n_dims in [20, 100]: #[20,50,100,200]: #TODO #PRECOMMIT
        create_dataset(n_dims, "courses")
    # for n_dims in [20, 100]: #[20,50,100,200]: #TODO #PRECOMMIT
    #     create_dataset(n_dims, "courses")
    translate_descriptions()


def translate_descriptions():
    names, descriptions, mds = load_mds(join(SID_DATA_BASE, "siddata_names_descriptions_mds_20.json"))
    assert len(set(names)) == len(names)
    descriptions = [html.unescape(i) for i in descriptions]
    name_desc = dict(zip(names, descriptions))
    if isfile((translationsfile := join(SID_DATA_BASE, "translated_descriptions.json"))):
        with open(translationsfile, "r") as rfile:
            translateds = json.load(rfile)
    else:
        translateds = {}
    unknown = {}
    print("Checking language of descriptions...")
    for name, desc in tqdm(name_desc.items()):
        if name not in translateds:
            try:
                if (lan := detect(desc)) != "en":
                    unknown[name] = [desc, lan]
            except LangDetectException:
                unknown[name] = [desc, "unk"]
    print(f"There are {len(''.join([i[0] for i in unknown.values()]))} characters to be translated.")
    to_translate = [i for i in unknown.keys() if i not in translateds]
    translations = translate_text([unknown[i][0] for i in to_translate])
    # hash_translates = dict(zip([hashlib.sha256(i.encode("UTF-8")).hexdigest() for i in to_translate], translations))
    translateds.update(dict(zip(to_translate, translations)))
    with open(join(SID_DATA_BASE, "translated_descriptions.json"), "w") as wfile:
        json.dump(translateds, wfile)
    print()
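The `translate_text` helper imported from `src.main.util.google_translate` is not part of this diff; a minimal sketch of what it could wrap, given the pinned google-cloud-translate==2.0.1 (the batch size and target language are assumptions):

```python
# Sketch of a translate_text helper on top of google-cloud-translate 2.0.1;
# the actual src/main/util/google_translate.py is not shown in this diff.
from google.cloud import translate_v2 as translate

def translate_text(texts, target_language="en"):
    client = translate.Client()  # needs GOOGLE_APPLICATION_CREDENTIALS set
    results = []
    for i in range(0, len(texts), 100):  # translate in modest batches
        batch = client.translate(texts[i:i + 100], target_language=target_language)
        results.extend(res["translatedText"] for res in batch)
    return results
```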


def create_dataset(n_dims, dsetname):
    # assert not DEBUG #TODO #PRECOMMIT
    names, descriptions, mds = create_mds(join(SID_DATA_BASE, f"siddata_names_descriptions_mds_{n_dims}.json"), n_dims=n_dims)
    # names, descriptions, mds = load_mds(join(SID_DATA_BASE, f"siddata_names_descriptions_mds_{n_dims}.json")) #TODO #PRECOMMIT comment out other line
    display_mds(mds, names)
    fname = join(DATA_BASE, dsetname, f"d{n_dims}", f"{dsetname}{n_dims}.mds")
    fname = join(SPACES_DATA_BASE, dsetname, f"d{n_dims}", f"{dsetname}{n_dims}.mds")
    os.makedirs(dirname(fname), exist_ok=True)
    embedding = list(mds.embedding_)
    # indices = np.argsort(np.array(names))
12 changes: 12 additions & 0 deletions scripts/derive_conceptualspaces.py
@@ -0,0 +1,12 @@
"""Section 4.2 in DESC15"""
from src.static.settings import SPACES_DATA_BASE, DATA_SET, MDS_DIMENSIONS
from src.main.load_data.load_semanticspaces import load_mds_representation, get_names, get_grouped_candidates

def main():
    mds, mds_path = load_mds_representation(SPACES_DATA_BASE, DATA_SET, MDS_DIMENSIONS)
    names, names_path = get_names(SPACES_DATA_BASE, DATA_SET)
    candidates, group_vectors = get_grouped_candidates(SPACES_DATA_BASE, DATA_SET, MDS_DIMENSIONS)
    print()

if __name__ == '__main__':
    main()
8 changes: 4 additions & 4 deletions scripts/display_courses_betweenness.py
@@ -1,9 +1,9 @@
from os.path import join

from src.main.load_data.load_semanticspaces import load_mds_representation, get_names
from src.static.settings import DATA_BASE, DATA_SET, MDS_DIMENSIONS
from src.static.settings import SPACES_DATA_BASE, MDS_DIMENSIONS
from src.main.measures import between_a
from main.util.logging import setup_logging
from src.main.util.logutils import setup_logging
from src.test.test_semanticspaces_measures import find_betweenness_position

SOME_IDS = {"Computer Vision": 4155, "Computergrafik": 547, "Computergrafikpraktikum": 453, "Machine Learning": 1685, "Rechnernetzepraktikum": 1921}
@@ -22,8 +22,8 @@ def get_descriptions():


def show_betwennesses():
    mds = load_mds_representation(DATA_BASE, DATA_SET, MDS_DIMENSIONS)[0]
    names = get_names(DATA_BASE, DATA_SET)[0]
    mds = load_mds_representation(SPACES_DATA_BASE, "courses", MDS_DIMENSIONS)[0]
    names = get_names(SPACES_DATA_BASE, "courses")[0]
    name_mds = dict(zip(names, mds))
    candidates = [("Computergrafik", "Computer Vision", "Machine Learning"), ("Rechnernetzepraktikum", "Computergrafik", "Computergrafikpraktikum")]
    descriptions = get_descriptions()
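The `between_a` measure itself is not shown in this diff; as a purely geometric intuition (an assumption, not the actual definition from `src.main.measures`), a point b can be scored as "between" a and c by how small the detour over b is compared to the direct path:

```python
# Geometric betweenness intuition (a sketch, not the between_a definition).
import numpy as np

def betweenness_score(a, b, c):
    a, b, c = np.asarray(a), np.asarray(b), np.asarray(c)
    direct = np.linalg.norm(c - a)
    detour = np.linalg.norm(b - a) + np.linalg.norm(c - b)
    return direct / detour  # 1.0 exactly iff b lies on the segment from a to c

print(betweenness_score([0, 0], [1, 1], [2, 2]))  # 1.0
print(betweenness_score([0, 0], [0, 2], [2, 2]))  # ~0.71
```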
Expand Up @@ -16,7 +16,7 @@
"import plotly.graph_objects as go\n",
"import pandas as pd\n",
"\n",
"from src.static.settings import MONGO_URI, ENV_FILE_PATH, DATA_BASE\n",
"from src.static.settings import MONGO_URI, ENV_FILE_PATH, SPACES_DATA_BASE\n",
"from src.main.load_data.load_semanticspaces import load_mds_representation, get_names, get_classes\n",
"from scripts.create_siddata_dataset import display_mds #TODO display in scripts?!"
]
@@ -83,12 +83,12 @@
" latest_tsne = [i for i in tsne_exps if i.status == \"COMPLETED\" and i.config.get(\"tsne_dims\") == tsne_dim and i.config.get(\"mds_dimensions\") == mds_dim and i.config.get(\"data_set\") == data_set][-1]\n",
"\n",
" tsne_arr = pd.read_csv(StringIO(latest_tsne.artifacts[\"tSNE\"].content.decode(\"UTF-8\")))\n",
" classes = get_classes(DATA_BASE, data_set, what=cat_name)\n",
" classes = get_classes(SPACES_DATA_BASE, data_set, what=cat_name)\n",
" tsne_arr[\"Category\"] = [val[0] if val else \"None\" for key, val in classes.items()]\n",
"\n",
" #TODO ACTUALLY these should be loaded from the experiment as well, but add_resource doesn't let me provide names\n",
" mds, _ = load_mds_representation(DATA_BASE, data_set, mds_dim)\n",
" names, _ = get_names(DATA_BASE, data_set)\n",
" mds, _ = load_mds_representation(SPACES_DATA_BASE, data_set, mds_dim)\n",
" names, _ = get_names(SPACES_DATA_BASE, data_set)\n",
"\n",
" display_mds(mds, names, max_elems=2)\n",
"\n",
6 changes: 3 additions & 3 deletions scripts/sacred/create_tsne.py
@@ -6,7 +6,7 @@
from sklearn.manifold import TSNE

from os.path import join
from src.static.settings import DATA_BASE, DATA_DUMP_DIR, MONGO_URI
from src.static.settings import SPACES_DATA_BASE, DATA_DUMP_DIR, MONGO_URI
from src.main.load_data.load_semanticspaces import load_mds_representation, get_names

########################################################################################################################
@@ -40,8 +40,8 @@ def make_tsne_df(mds, names, n_dims=3):
def main(mds_dimensions, data_set, tsne_dims):
    exp_inf_str = "__".join([f"{key}_{val}" for key, val in cfg().items()])
    dump_name = join(DATA_DUMP_DIR, f"tsne_{exp_inf_str}.csv")
    mds, mds_path = load_mds_representation(DATA_BASE, data_set, mds_dimensions)
    names, names_path = get_names(DATA_BASE, data_set)
    mds, mds_path = load_mds_representation(SPACES_DATA_BASE, data_set, mds_dimensions)
    names, names_path = get_names(SPACES_DATA_BASE, data_set)
    ex.add_resource(mds_path)
    ex.add_resource(names_path)
    df = make_tsne_df(mds, names, tsne_dims)
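`make_tsne_df` is only touched, not shown, in this diff; a sketch of what such a helper presumably does with the imports above (the column naming is an assumption):

```python
# Sketch of a make_tsne_df-style helper: project the MDS vectors with t-SNE
# and keep the item names next to the coordinates (column names assumed).
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE

def make_tsne_df_sketch(mds, names, n_dims=3):
    arr = TSNE(n_components=n_dims).fit_transform(np.asarray(mds))
    df = pd.DataFrame(arr, columns=[f"tsne_{i}" for i in range(n_dims)])
    df["name"] = list(names)
    return df
```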
15 changes: 15 additions & 0 deletions scripts/upload_model.py
@@ -0,0 +1,15 @@
import os
from src.main.util.model_downloader_seafile import get_write_account_data, SeafileModelSyncer, model_downloader_logger
from src.static import settings

model_downloader_logger.setLevel("INFO")
localpath = settings.DATA_BASE
account, password, server, repoid, repopath, modelversions = get_write_account_data()
modelsyncer = SeafileModelSyncer(server, account, password, repoid, repopath)
if modelsyncer.repo is not None:
    print("Do you really want to upload the following:")
    for mname, mversion in modelversions.items():
        if mversion is not None:
            print(f"{mname} in version {mversion}")
    if input("? [y/n]").lower() == "y":
        modelsyncer.upload_modeldirs(localpath, modelversions, overwrite_version=False)
4 changes: 2 additions & 2 deletions src/main/load_data/dataset_specifics/courses.py
@@ -14,6 +14,6 @@ def get_classes(data_base, what):


if __name__ == "__main__":
    from src.static.settings import DATA_BASE
    tmp = get_classes(DATA_BASE, "Fachbereich")
    from src.static.settings import SPACES_DATA_BASE
    tmp = get_classes(SPACES_DATA_BASE, "Fachbereich")
    print(tmp)
15 changes: 12 additions & 3 deletions src/main/load_data/dataset_specifics/movies.py
@@ -3,7 +3,7 @@

import numpy as np

from main.load_data.load_semanticspaces import get_names
from src.main.load_data.load_semanticspaces import get_names

ORDER = ['Musical', 'Music', 'Documentary', 'Western', 'Animation', 'War', 'History', 'Sci-Fi', 'Horror', 'Sport', 'Biography', 'Film-Noir', 'News', 'Fantasy', 'Adult', 'Crime', 'Thriller', 'Comedy', 'Romance', 'Action', 'Mystery', 'Adventure', 'Drama', 'Family', 'Short']
#this order is roughly sorted by informativeness: if a movie is e.g. both "Musical" and "Family" and only one label may be picked, it gets the more informative "Musical"
@@ -21,8 +21,17 @@ def get_classes(data_base, what):
return classes


def get_candidateterms(data_base, data_set, n_dims, **kwargs):
    dir = join(data_base, data_set, f"d{n_dims}", "DirectionsHeal")
    vecnames = [i for i in os.listdir(dir) if i.endswith(".vector")]
    vectors = [np.loadtxt(join(dir, i)) for i in vecnames]
    vecnames = [i[:-len(".vector")] for i in vecnames]
    vecnames2 = [list(zip(*[j.split("_") for j in i.split(" ")])) for i in vecnames]
    words, poss = [" ".join(i[0]) for i in vecnames2], [" ".join(i[1]) if len(i) > 1 else None for i in vecnames2]
    return words, poss, vectors, vecnames
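To re-trace the filename parsing above on a made-up example (the real `.vector` names in DirectionsHeal are assumed here to follow the `word_POS[ word_POS...]` pattern this code expects):

```python
# Re-tracing get_candidateterms' parsing on a hypothetical filename.
vecname = "special_JJ effects_NNS"  # filename with ".vector" stripped
parts = list(zip(*[j.split("_") for j in vecname.split(" ")]))
# parts == [('special', 'effects'), ('JJ', 'NNS')]
words = " ".join(parts[0])                             # "special effects"
poss = " ".join(parts[1]) if len(parts) > 1 else None  # "JJ NNS"
print(words, poss)
```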


if __name__ == "__main__":
    from src.static.settings import DATA_BASE
    tmp = get_classes(DATA_BASE, "Genres")
    from src.static.settings import SPACES_DATA_BASE
    tmp = get_classes(SPACES_DATA_BASE, "Genres")
    print(tmp)
