
Commit

Google-translate course titles, Docker container, model up-/download included
cstenkamp committed Jun 17, 2021
1 parent 4deb9b8 commit 8c3bf93
Showing 20 changed files with 865 additions and 39 deletions.
4 changes: 4 additions & 0 deletions .dockershell.sh
@@ -0,0 +1,4 @@
alias ls='ls --color=auto'
alias ll='ls -alFh'
alias la='ls -A'
alias l='ls -CF'
41 changes: 41 additions & 0 deletions Dockerfile
@@ -0,0 +1,41 @@
#docker build -f Dockerfile --build-arg uid=${COMPOSE_UID:-1000} --build-arg gid=${COMPOSE_GID:-1000} --rm --tag derive_conceptualspaces .
#docker run -it --name derive_conceptualspaces_cont -v /home/chris/Documents/UNI_neu/Masterarbeit/data/:/opt/data derive_conceptualspaces bash
#docker start derive_conceptualspaces_cont -i
#docker container rm derive_conceptualspaces_cont -f && docker build -f Dockerfile --build-arg uid=${COMPOSE_UID:-1000} --build-arg gid=${COMPOSE_GID:-1000} --rm --tag derive_conceptualspaces .

ARG PYTHON_VERSION=3.9.1
FROM python:${PYTHON_VERSION}-buster

ARG uid
ARG gid

RUN apt-get update \
&& apt-get install -y bash git vim curl zsh htop tmux unzip nano

ARG WORKDIR=/opt/derive_conceptualspaces
COPY . ${WORKDIR}
WORKDIR ${WORKDIR}
ENV PYTHONPATH=${WORKDIR}
ENV RUNNING_IN_DOCKER=1

RUN ln -sf /usr/local/bin/python3 /usr/bin/python3
RUN ln -sf /usr/bin/python3 /usr/bin/python
RUN python3 -m pip install --upgrade pip
RUN ln -sf /usr/bin/pip3 /usr/bin/pip
RUN pip install -r ./requirements.txt

RUN groupadd -g ${gid} developer \
&& useradd -g developer -u ${uid} -m developer
USER developer

#https://dev.to/arctic_hen7/setting-up-zsh-in-docker-263f
RUN mkdir -p /home/developer/.antigen
RUN curl -L git.io/antigen > /home/developer/.antigen/antigen.zsh
COPY .dockershell.sh /home/developer/.zshrc
USER root
RUN chown -R developer:developer /home/developer/.antigen /home/developer/.zshrc
USER developer
RUN /bin/zsh /home/developer/.zshrc

ENV HOME=/home/developer
ENV SHELL=/bin/zsh
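The header comments above give the exact build and run invocations; passing your own uid/gid keeps files written to the mounted /opt/data volume owned by you instead of root. If you prefer scripting that flow, here is a minimal sketch in Python (the data path is just the example from the comments; adjust it to your machine):

```python
# Wraps the docker build/run commands from the Dockerfile header comments
# (a sketch; the default data directory is the example path from above).
import os
import subprocess

def build_and_run(data_dir="/home/chris/Documents/UNI_neu/Masterarbeit/data/"):
    uid, gid = str(os.getuid()), str(os.getgid())
    subprocess.run(["docker", "build", "-f", "Dockerfile",
                    "--build-arg", f"uid={uid}", "--build-arg", f"gid={gid}",
                    "--rm", "--tag", "derive_conceptualspaces", "."], check=True)
    subprocess.run(["docker", "run", "-it", "--name", "derive_conceptualspaces_cont",
                    "-v", f"{data_dir}:/opt/data", "derive_conceptualspaces", "bash"],
                   check=True)

if __name__ == "__main__":
    build_and_run()
```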
43 changes: 42 additions & 1 deletion README.md
@@ -1,3 +1,42 @@
## How to get the data

* Data comes from [DESC15] and can be obtained from http://www.cs.cf.ac.uk/semanticspaces/.
* Download everything there and arrange the directory structure like this (a quick sanity-check script follows below the tree):
```
movies
    classesGenres
    classesKeywords
    classesRatings
    d20
        DirectionsHeal
        clusters20.txt
        films20.mds
        films20.projected
        projections20.data
    d50
        ...
    d100
        ...
    d200
        ...
    Tokens
    filmNames.txt
    tokens.json
wines
    classes
    d20
        ...
    d50
        ...
    d100
        ...
    d200
        ...
    Tokens
    wineNames.txt
places
    ...
```
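A minimal sanity check for this layout (a sketch; it probes only a few of the paths from the tree above, and `data_base` is assumed to point at the download directory):

```python
# Sanity-check the semanticspaces download (a sketch; only a few of the
# expected paths from the tree above are probed).
from os.path import join, isdir, isfile

def check_data_layout(data_base):
    for dset in ["movies", "wines", "places"]:
        assert isdir(join(data_base, dset)), f"missing dataset dir: {dset}"
    for n_dims in [20, 50, 100, 200]:
        assert isdir(join(data_base, "movies", f"d{n_dims}")), f"missing movies/d{n_dims}"
    assert isfile(join(data_base, "movies", "d20", "films20.mds"))
    assert isfile(join(data_base, "movies", "filmNames.txt"))
    print("Data layout looks fine.")
```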
## Contributing

### Set up development environment
@@ -11,4 +50,6 @@ nbdime config-git --enable --global

### Set up Sacred

See https://sacred.readthedocs.io/en/stable/examples.html#docker-setup for the easiest way to get MongoDB and the dashboards running. The */docker* directory here is a clone of the corresponding *examples* directory from the Sacred repo. To have the same `.env` file in your local setup, I can recommend PyCharm's **EnvFile** plugin.
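For reference, a minimal sketch of a Sacred experiment wired to that MongoDB (assumptions: the `MONGO_URI` environment variable comes from your `.env` file, and the database name matches your docker setup):

```python
# Minimal Sacred experiment with a MongoObserver (a sketch; MONGO_URI and
# the db name are assumptions to be adapted to your .env/docker setup).
import os
from sacred import Experiment
from sacred.observers import MongoObserver

ex = Experiment("derive_conceptualspaces")
ex.observers.append(MongoObserver(url=os.environ["MONGO_URI"], db_name="sacred"))

@ex.config
def cfg():
    mds_dimensions = 20  # hypothetical config entry

@ex.automain
def main(mds_dimensions):
    print(f"running with mds_dimensions={mds_dimensions}")
```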


8 changes: 7 additions & 1 deletion requirements.txt
@@ -15,11 +15,17 @@ seaborn
ipyparams
nbdime
#nbdime config-git --enable --global
langdetect
google-cloud-translate==2.0.1

python-Levenshtein #for data_exploration.ipynb in data
python-dotenv

#sacred
sacred
pymongo
incense

#model-downloader
#how to install seafile-cli on your system: see https://download.seafile.com/published/seafile-user-manual/syncing_client/install_linux_client.md
git+https://github.com/cstenkamp/python-seafile.git@v0.1.2#egg=python_seafile #on PyPI there's only v0.1.0, which is broken, and even the original repo has an error with binary files
50 changes: 43 additions & 7 deletions scripts/create_siddata_dataset.py
@@ -2,7 +2,7 @@
available at http://www.cs.cf.ac.uk/semanticspaces/. Meaning: MDS, ..."""

#TODO make (snakemake?) Pipeline that runs start to finish and creates the complete directory

import hashlib
from os.path import join, isfile, dirname, basename
import re
import random
@@ -11,12 +11,18 @@

import numpy as np
import pandas as pd

from src.static.settings import SID_DATA_BASE, DEBUG, RANDOM_SEED, DATA_BASE
from main.util.logging import setup_logging
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
from tqdm import tqdm
import html
import json

from src.static.settings import SID_DATA_BASE, DEBUG, RANDOM_SEED, SPACES_DATA_BASE
from src.main.util.logutils import setup_logging
from src.main.util.pretty_print import pretty_print as print
from src.main.load_data.siddata_data_prep.create_mds import preprocess_data
from src.main.load_data.siddata_data_prep.jsonloadstore import json_dump, json_load
from src.main.util.google_translate import translate_text

logger = logging.getLogger(basename(__file__))

@@ -25,16 +31,46 @@
def main():
    setup_logging("INFO")
    random.seed(RANDOM_SEED)
    for n_dims in [20, 100]: #[20,50,100,200]: #TODO #PRECOMMIT
        create_dataset(n_dims, "courses")
    # for n_dims in [20, 100]: #[20,50,100,200]: #TODO #PRECOMMIT
    #     create_dataset(n_dims, "courses")
    translate_descriptions()


def translate_descriptions():
    names, descriptions, mds = load_mds(join(SID_DATA_BASE, "siddata_names_descriptions_mds_20.json"))
    assert len(set(names)) == len(names)
    descriptions = [html.unescape(i) for i in descriptions]
    name_desc = dict(zip(names, descriptions))
    if isfile((translationsfile := join(SID_DATA_BASE, "translated_descriptions.json"))):
        with open(translationsfile, "r") as rfile:
            translateds = json.load(rfile)
    else:
        translateds = {}
    unknown = {}
    print("Checking language of descriptions...")
    for name, desc in tqdm(name_desc.items()):
        if name not in translateds:
            try:
                if (lan := detect(desc)) != "en":
                    unknown[name] = [desc, lan]
            except LangDetectException:
                unknown[name] = [desc, "unk"]
    print(f"There are {len(''.join([i[0] for i in unknown.values()]))} characters to be translated.")
    to_translate = [i for i in unknown.keys() if i not in translateds]
    translations = translate_text([unknown[i][0] for i in to_translate])
    # hash_translates = dict(zip([hashlib.sha256(i.encode("UTF-8")).hexdigest() for i in to_translate], translations))
    translateds.update(dict(zip(to_translate, translations)))
    with open(join(SID_DATA_BASE, "translated_descriptions.json"), "w") as wfile:
        json.dump(translateds, wfile)
    print()
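The `translate_text` helper imported from `src.main.util.google_translate` is not part of this diff; a minimal sketch of what it could wrap, given the pinned google-cloud-translate==2.0.1 (the batch size and target language are assumptions):

```python
# Sketch of a translate_text helper on top of google-cloud-translate 2.0.1;
# the actual src/main/util/google_translate.py is not shown in this diff.
from google.cloud import translate_v2 as translate

def translate_text(texts, target_language="en"):
    client = translate.Client()  # needs GOOGLE_APPLICATION_CREDENTIALS set
    results = []
    for i in range(0, len(texts), 100):  # translate in modest batches
        batch = client.translate(texts[i:i + 100], target_language=target_language)
        results.extend(res["translatedText"] for res in batch)
    return results
```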


def create_dataset(n_dims, dsetname):
    # assert not DEBUG #TODO #PRECOMMIT
    names, descriptions, mds = create_mds(join(SID_DATA_BASE, f"siddata_names_descriptions_mds_{n_dims}.json"), n_dims=n_dims)
    # names, descriptions, mds = load_mds(join(SID_DATA_BASE, f"siddata_names_descriptions_mds_{n_dims}.json")) #TODO #PRECOMMIT comment out other line
    display_mds(mds, names)
    fname = join(DATA_BASE, dsetname, f"d{n_dims}", f"{dsetname}{n_dims}.mds")
    fname = join(SPACES_DATA_BASE, dsetname, f"d{n_dims}", f"{dsetname}{n_dims}.mds")
    os.makedirs(dirname(fname), exist_ok=True)
    embedding = list(mds.embedding_)
    # indices = np.argsort(np.array(names))
12 changes: 12 additions & 0 deletions scripts/derive_conceptualspaces.py
@@ -0,0 +1,12 @@
"""Section 4.2 in DESC15"""
from src.static.settings import SPACES_DATA_BASE, DATA_SET, MDS_DIMENSIONS
from src.main.load_data.load_semanticspaces import load_mds_representation, get_names, get_grouped_candidates

def main():
    mds, mds_path = load_mds_representation(SPACES_DATA_BASE, DATA_SET, MDS_DIMENSIONS)
    names, names_path = get_names(SPACES_DATA_BASE, DATA_SET)
    candidates, group_vectors = get_grouped_candidates(SPACES_DATA_BASE, DATA_SET, MDS_DIMENSIONS)
    print()

if __name__ == '__main__':
    main()
8 changes: 4 additions & 4 deletions scripts/display_courses_betweenness.py
@@ -1,9 +1,9 @@
from os.path import join

from src.main.load_data.load_semanticspaces import load_mds_representation, get_names
from src.static.settings import DATA_BASE, DATA_SET, MDS_DIMENSIONS
from src.static.settings import SPACES_DATA_BASE, MDS_DIMENSIONS
from src.main.measures import between_a
from main.util.logging import setup_logging
from src.main.util.logutils import setup_logging
from src.test.test_semanticspaces_measures import find_betweenness_position

SOME_IDS = {"Computer Vision": 4155, "Computergrafik": 547, "Computergrafikpraktikum": 453, "Machine Learning": 1685, "Rechnernetzepraktikum": 1921}
@@ -22,8 +22,8 @@ def get_descriptions():


def show_betwennesses():
    mds = load_mds_representation(DATA_BASE, DATA_SET, MDS_DIMENSIONS)[0]
    names = get_names(DATA_BASE, DATA_SET)[0]
    mds = load_mds_representation(SPACES_DATA_BASE, "courses", MDS_DIMENSIONS)[0]
    names = get_names(SPACES_DATA_BASE, "courses")[0]
    name_mds = dict(zip(names, mds))
    candidates = [("Computergrafik", "Computer Vision", "Machine Learning"), ("Rechnernetzepraktikum", "Computergrafik", "Computergrafikpraktikum")]
    descriptions = get_descriptions()
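The `between_a` measure itself is not shown in this diff; as a purely geometric intuition (an assumption, not the actual definition from `src.main.measures`), a point b can be scored as "between" a and c by how small the detour over b is compared to the direct path:

```python
# Geometric betweenness intuition (a sketch, not the between_a definition).
import numpy as np

def betweenness_score(a, b, c):
    a, b, c = np.asarray(a), np.asarray(b), np.asarray(c)
    direct = np.linalg.norm(c - a)
    detour = np.linalg.norm(b - a) + np.linalg.norm(c - b)
    return direct / detour  # 1.0 exactly iff b lies on the segment from a to c

print(betweenness_score([0, 0], [1, 1], [2, 2]))  # 1.0
print(betweenness_score([0, 0], [0, 2], [2, 2]))  # ~0.71
```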
Expand Up @@ -16,7 +16,7 @@
"import plotly.graph_objects as go\n",
"import pandas as pd\n",
"\n",
"from src.static.settings import MONGO_URI, ENV_FILE_PATH, DATA_BASE\n",
"from src.static.settings import MONGO_URI, ENV_FILE_PATH, SPACES_DATA_BASE\n",
"from src.main.load_data.load_semanticspaces import load_mds_representation, get_names, get_classes\n",
"from scripts.create_siddata_dataset import display_mds #TODO display in scripts?!"
]
@@ -83,12 +83,12 @@
" latest_tsne = [i for i in tsne_exps if i.status == \"COMPLETED\" and i.config.get(\"tsne_dims\") == tsne_dim and i.config.get(\"mds_dimensions\") == mds_dim and i.config.get(\"data_set\") == data_set][-1]\n",
"\n",
" tsne_arr = pd.read_csv(StringIO(latest_tsne.artifacts[\"tSNE\"].content.decode(\"UTF-8\")))\n",
" classes = get_classes(DATA_BASE, data_set, what=cat_name)\n",
" classes = get_classes(SPACES_DATA_BASE, data_set, what=cat_name)\n",
" tsne_arr[\"Category\"] = [val[0] if val else \"None\" for key, val in classes.items()]\n",
"\n",
" #TODO ACTUALLY these should be loaded from the experiment as well, but add_resource doesn't let me provide names\n",
" mds, _ = load_mds_representation(DATA_BASE, data_set, mds_dim)\n",
" names, _ = get_names(DATA_BASE, data_set)\n",
" mds, _ = load_mds_representation(SPACES_DATA_BASE, data_set, mds_dim)\n",
" names, _ = get_names(SPACES_DATA_BASE, data_set)\n",
"\n",
" display_mds(mds, names, max_elems=2)\n",
"\n",
6 changes: 3 additions & 3 deletions scripts/sacred/create_tsne.py
@@ -6,7 +6,7 @@
from sklearn.manifold import TSNE

from os.path import join
from src.static.settings import DATA_BASE, DATA_DUMP_DIR, MONGO_URI
from src.static.settings import SPACES_DATA_BASE, DATA_DUMP_DIR, MONGO_URI
from src.main.load_data.load_semanticspaces import load_mds_representation, get_names

########################################################################################################################
@@ -40,8 +40,8 @@ def make_tsne_df(mds, names, n_dims=3):
def main(mds_dimensions, data_set, tsne_dims):
    exp_inf_str = "__".join([f"{key}_{val}" for key, val in cfg().items()])
    dump_name = join(DATA_DUMP_DIR, f"tsne_{exp_inf_str}.csv")
    mds, mds_path = load_mds_representation(DATA_BASE, data_set, mds_dimensions)
    names, names_path = get_names(DATA_BASE, data_set)
    mds, mds_path = load_mds_representation(SPACES_DATA_BASE, data_set, mds_dimensions)
    names, names_path = get_names(SPACES_DATA_BASE, data_set)
    ex.add_resource(mds_path)
    ex.add_resource(names_path)
    df = make_tsne_df(mds, names, tsne_dims)
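`make_tsne_df` is only touched, not shown, in this diff; a sketch of what such a helper presumably does with the imports above (the column naming is an assumption):

```python
# Sketch of a make_tsne_df-style helper: project the MDS vectors with t-SNE
# and keep the item names next to the coordinates (column names assumed).
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE

def make_tsne_df_sketch(mds, names, n_dims=3):
    arr = TSNE(n_components=n_dims).fit_transform(np.asarray(mds))
    df = pd.DataFrame(arr, columns=[f"tsne_{i}" for i in range(n_dims)])
    df["name"] = list(names)
    return df
```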
15 changes: 15 additions & 0 deletions scripts/upload_model.py
@@ -0,0 +1,15 @@
import os
from src.main.util.model_downloader_seafile import get_write_account_data, SeafileModelSyncer, model_downloader_logger
from src.static import settings

model_downloader_logger.setLevel("INFO")
localpath = settings.DATA_BASE
account, password, server, repoid, repopath, modelversions = get_write_account_data()
modelsyncer = SeafileModelSyncer(server, account, password, repoid, repopath)
if modelsyncer.repo is not None:
    print("Do you really want to upload the following:")
    for mname, mversion in modelversions.items():
        if mversion is not None:
            print(f"{mname} in version {mversion}")
    if input("? [y/n]").lower() == "y":
        modelsyncer.upload_modeldirs(localpath, modelversions, overwrite_version=False)
4 changes: 2 additions & 2 deletions src/main/load_data/dataset_specifics/courses.py
@@ -14,6 +14,6 @@ def get_classes(data_base, what):


if __name__ == "__main__":
    from src.static.settings import DATA_BASE
    tmp = get_classes(DATA_BASE, "Fachbereich")
    from src.static.settings import SPACES_DATA_BASE
    tmp = get_classes(SPACES_DATA_BASE, "Fachbereich")
    print(tmp)
15 changes: 12 additions & 3 deletions src/main/load_data/dataset_specifics/movies.py
@@ -3,7 +3,7 @@

import numpy as np

from main.load_data.load_semanticspaces import get_names
from src.main.load_data.load_semanticspaces import get_names

ORDER = ['Musical', 'Music', 'Documentary', 'Western', 'Animation', 'War', 'History', 'Sci-Fi', 'Horror', 'Sport', 'Biography', 'Film-Noir', 'News', 'Fantasy', 'Adult', 'Crime', 'Thriller', 'Comedy', 'Romance', 'Action', 'Mystery', 'Adventure', 'Drama', 'Family', 'Short']
#this order is roughly sorted by informativeness: if a movie is e.g. both "Musical" and "Family" and only one label may be picked, it gets the more informative "Musical"
@@ -21,8 +21,17 @@ def get_classes(data_base, what):
return classes


def get_candidateterms(data_base, data_set, n_dims, **kwargs):
    dir = join(data_base, data_set, f"d{n_dims}", "DirectionsHeal")
    vecnames = [i for i in os.listdir(dir) if i.endswith(".vector")]
    vectors = [np.loadtxt(join(dir, i)) for i in vecnames]
    vecnames = [i[:-len(".vector")] for i in vecnames]
    vecnames2 = [list(zip(*[j.split("_") for j in i.split(" ")])) for i in vecnames]
    words, poss = [" ".join(i[0]) for i in vecnames2], [" ".join(i[1]) if len(i) > 1 else None for i in vecnames2]
    return words, poss, vectors, vecnames
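To re-trace the filename parsing above on a made-up example (the real `.vector` names in DirectionsHeal are assumed here to follow the `word_POS[ word_POS...]` pattern this code expects):

```python
# Re-tracing get_candidateterms' parsing on a hypothetical filename.
vecname = "special_JJ effects_NNS"  # filename with ".vector" stripped
parts = list(zip(*[j.split("_") for j in vecname.split(" ")]))
# parts == [('special', 'effects'), ('JJ', 'NNS')]
words = " ".join(parts[0])                             # "special effects"
poss = " ".join(parts[1]) if len(parts) > 1 else None  # "JJ NNS"
print(words, poss)
```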


if __name__ == "__main__":
    from src.static.settings import DATA_BASE
    tmp = get_classes(DATA_BASE, "Genres")
    from src.static.settings import SPACES_DATA_BASE
    tmp = get_classes(SPACES_DATA_BASE, "Genres")
    print(tmp)
