diff --git a/.gitignore b/.gitignore index d21474f..1616478 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,5 @@ venv/* .idea/* webapp.log static/topicmodeling.zip +__pycache__/* +Pipfile.lock diff --git a/Pipfile b/Pipfile index f46a1ce..da55c7c 100644 --- a/Pipfile +++ b/Pipfile @@ -14,6 +14,7 @@ Flask = "==0.12.2" lxml = "==4.1.1" pandas = "==0.21.1" numpy = "==1.14.0" +pyqt = "==5.9.2" [dev-packages] diff --git a/README.md b/README.md index f4a7561..06348d0 100755 --- a/README.md +++ b/README.md @@ -1,25 +1,24 @@ # Topics Explorer: A GUI for Topics – Easy Topic Modeling This application introduces an user-friendly Topic Modeling workflow, basically containing text data preprocessing, the actual modeling using [latent Dirichlet allocation](http://www.jmlr.org/papers/volume3/blei03a/blei03a.pdf) (LDA), as well as various interactive visualizations. -**If you do not know anything about Topic Modeling or programming in general, this is where you start.** +If you do not know anything about Topic Modeling or programming in general, this is where you start. -**Topics Explorer** aims for *simplicity* and *usability*. If you are working with a large corpus (let's say more than 200 documents, 5000 tokens each document) you may wish to use more sophisticated Topic Models such as those implemented in [MALLET](http://mallet.cs.umass.edu/topics.php), which is known to be more robust than standard LDA. Have a look at our Jupyter notebook [introducing Topic Modeling with MALLET](https://github.com/DARIAH-DE/Topics/IntroducingMallet.ipynb). - -![Demonstrator Screenshot](screenshot.png) +## Getting started with the standalone executable +You **do not** have to install a Python interpreter or anything else. There is currently one standalone build for Windows and macOS, respectively. **At the moment, Linux users will have to use the development version**. +1. 
Go to the [release-section](https://github.com/DARIAH-DE/TopicsExplorer/releases) and download the ZIP archive for your OS. +2. Open it by double-clicking. +3. Run the app by double-clicking the file `DARIAH Topics Explorer`. (The files in the folder `src` are basically source code. You do not need to worry about that). -## Getting started -Although this application is built with Python and some JavaScript, it is possible to run it as if it was a native application, without having to install Python or any related packages. There is currently one build for Windows and macOS, respectively. +**Topics Explorer** aims for simplicity and usability. If you are working with a large corpus (let's say more than 200 documents, 5000 tokens each document) you may wish to use more sophisticated topic models such as those implemented in [MALLET](http://mallet.cs.umass.edu/topics.php), which is known to be more robust than standard LDA. Have a look at our Jupyter notebook [introducing Topic Modeling with MALLET](https://github.com/DARIAH-DE/Topics/blob/master/IntroducingMallet.ipynb). -1. Download `demonstrator-0.0.1-windows.zip` or `demonstrator-0.0.1-mac.zip` from the [release-section](https://github.com/DARIAH-DE/Topics/releases). -2. Open it by double-clicking. -3. Run the app by double-clicking the file `DARIAH Topics Explorer.exe` or `DARIAH Topics Explorer.app`, respectively. +![Demonstrator Screenshot](screenshot.png) ### Troubleshooting * Please be patient. Depending on corpus size and number of iterations, the process may take some time, meaning something between some seconds and some hours. * If you are on a Mac and get an error message saying that the file is from an “unidentified developer”, you can override it by holding control while double-clicking. The error message will still appear, but you will be given an option to run the file anyway. -* Please use [GitHub Issues](https://github.com/DARIAH-DE/TopicsExplorer/issues). 
+* Please use [GitHub issues](https://github.com/DARIAH-DE/TopicsExplorer/issues). ## Working with the development version @@ -32,18 +31,17 @@ Although this application is built with Python and some JavaScript, it is possib ### Requirements Besides the standalone executables, you have the ability to run the development version. In this case, you will have to install some dependencies, but first of all: - * At least Python 3.6, from [here](https://www.python.org/downloads/). Python 2 is *not* supported. -* If you wish to use *Layer 3* (which is not necessary at all): Node.js, from [here](https://nodejs.org/en/download/). -For Python, you will need the following libraries: -* [`dariah_topics`](https://github.com/DARIAH-DE/Topics) 0.0.5. -* [`lda`](https://github.com/lda-project/lda) 1.0.5. -* [`bokeh`](https://github.com/bokeh/bokeh) 0.12.13. -* [`flask`](https://github.com/pallets/flask) 0.12.2. -* [`lxml`](https://github.com/lxml/lxml) 4.1.1. -* [`pandas`](https://github.com/pandas-dev/pandas) 0.21.1. -* [`numpy`](https://github.com/numpy/numpy) 1.14.0. +You will need the following libraries: +* [`dariah_topics`](https://github.com/DARIAH-DE/Topics) 0.0.6 +* [`lda`](https://github.com/lda-project/lda) 1.0.5 +* [`bokeh`](https://github.com/bokeh/bokeh) 0.12.13 +* [`flask`](https://github.com/pallets/flask) 0.12.2 +* [`lxml`](https://github.com/lxml/lxml) 4.1.1 +* [`pandas`](https://github.com/pandas-dev/pandas) 0.21.1 +* [`numpy`](https://github.com/numpy/numpy) 1.14.0 +* [`pyqt5`](https://github.com/baoboa/pyqt5) 5.9.2. You can install all dependencies using [`pipenv`](http://pipenv.readthedocs.io/en/latest/): @@ -51,50 +49,32 @@ You can install all dependencies using [`pipenv`](http://pipenv.readthedocs.io/e pipenv install ``` -> If you are on a UNIX-based machine, remember using `pip3` and `python3` instead of `pip` and `python`. - -So far, you could run the application via `python webapp.py` and go to `http://127.0.0.1:5000` in any web browser. 
If you want a more desktop app-like feeling, you can build *Layer 3* on top with [Electron](https://electronjs.org/), a JavaScript framework for creating native applications with web technologies like JavaScript, HTML, and CSS. The dependencies are: - -* [`electron`](https://github.com/electron/electron) 1.7.10. -* [`request-promise`](https://github.com/request/request-promise) 4.2.2. -* [`request`](https://github.com/request/request) 2.83.1. +> If you are on a UNIX-based machine, remember to use `python3` instead of `python`. -Run the following command via [`npm`](https://www.npmjs.com/get-npm): +So far, you could run the application via `python webapp.py` and go to `http://127.0.0.1:5000` in any web browser. If you want a more desktop app-like feeling, you can build *Layer 3* on top and run: ``` -npm install +python topicsexplorer.py ``` + ### Contents -* [`bokeh_templates`](bokeh_templates): HTML templates for `bokeh`. This is only relevant, if you want to freeze the Python part with `pyinstaller`. -* [`hooks`](hooks): Necessary hook files. This is only relevant, if you want to freeze the Python part with `pyinstaller`. -* [`main.js`](main.js): Basically the GUI. -* [`package.json`](package.json): Metadata, dependencies, and scripts for the GUI. +* [`bokeh_templates`](bokeh_templates): HTML templates for `bokeh`. This is only relevant, if you want to freeze the scripts with PyInstaller. +* [`hooks`](hooks): Necessary hook files. This is only relevant, if you want to freeze the Python part with PyInstaller. * [`static`](static) and [`templates`](templates): Static files (e.g. images, CSS, etc.) and HTML templates for the `flask` template engine. * [`test`](test): Unittest for `webapp.py`, testing all functions of the application. * [`webapp.py`](webapp.py): Contains 3rd party functions and communicates with the webserver. -* [`webapp.spec`](webapp.spec): The build script for `pyinstaller` containing metadata. 
+* [`topicsexplorer.py`](topicsexplorer.py): A Qt-based UI displaying the contents of the app by running `webapp.py`. +* [`topicsexplorer.spec`](topicsexplorer.spec): The build script for PyInstaller containing metadata. ### Troubleshooting -* When installing `electron` fails, try `sudo npm install -g electron --unsafe-perm=true --allow-root`. -* Please use [GitHub Issues](https://github.com/DARIAH-DE/TopicsExplorer/issues). - +* Please use [GitHub issues](https://github.com/DARIAH-DE/TopicsExplorer/issues). -## Creating a build for Layer 1 and 2 -To freeze the Python part with `pyinstaller`, run on macOS: -``` -pyinstaller --onefile --add-data static:static --add-data templates:templates --add-data bokeh_templates:bokeh_templates --additional-hooks-dir hooks webapp.py -``` - -or, for Windows: -``` -pyinstaller --onefile --add-data static;static --add-data templates;templates --add-data bokeh_templates;bokeh_templates --additional-hooks-dir hooks webapp.py -``` -## Creating a build for the whole application -To freeze the Electron part with `electron-builder`, run: +## Creating a standalone build +To freeze the Python scripts with [PyInstaller](http://www.pyinstaller.org/), simply run: ``` -electron-builder +pyinstaller topicsexplorer.spec ``` diff --git a/bokeh_templates/autoload_js.js b/bokeh_templates/autoload_js.js index 2ee6886..b099055 100755 --- a/bokeh_templates/autoload_js.js +++ b/bokeh_templates/autoload_js.js @@ -1,54 +1,43 @@ {# - Renders JavaScript code - for "autoloading". +Renders JavaScript code for "autoloading". - The code automatically and asynchronously loads BokehJS( - if necessary) and - then replaces the AUTOLOAD_TAG `` < script > `` - tag that - calls it with the rendered model. 
+The code automatically and asynchronously loads BokehJS (if necessary) and +then replaces the AUTOLOAD_TAG `` +#} + diff --git a/bokeh_templates/css_resources.html b/bokeh_templates/css_resources.html index 15a000c..b45979d 100755 --- a/bokeh_templates/css_resources.html +++ b/bokeh_templates/css_resources.html @@ -1,24 +1,20 @@ -{# Renders HTML that loads Bokeh CSS according to the configuration in a Resources object. :param css_files: a list of URIs for CSS files to include :type css_files: list[str] :param css_raw: a list of raw CSS snippets to put between `` - {%- endfor %} diff --git a/bokeh_templates/doc_js.js b/bokeh_templates/doc_js.js index adcdf71..1362479 100755 --- a/bokeh_templates/doc_js.js +++ b/bokeh_templates/doc_js.js @@ -1,23 +1,7 @@ -{ % extends "try_run.js" % -} +{% extends "try_run.js" %} -{ % block code_to_run % -} -var docs_json = { - { - docs_json - } -}; -var render_items = { - { - render_items - } -}; -root.Bokeh.embed.embed_items(docs_json, render_items { % - - if app_path - % -}, "{{ app_path }}" { % -endif - % -} { % - - if absolute_url - % -}, "{{ absolute_url }}" { % -endif - % -}); { % endblock % -} +{% block code_to_run %} + var docs_json = {{ docs_json }}; + var render_items = {{ render_items }}; + root.Bokeh.embed.embed_items(docs_json, render_items{%- if app_path -%}, "{{ app_path }}" {%- endif -%}{%- if absolute_url -%}, "{{ absolute_url }}" {%- endif -%}); +{% endblock %} diff --git a/bokeh_templates/doc_nb_js.js b/bokeh_templates/doc_nb_js.js index c46cc32..a4f44bc 100755 --- a/bokeh_templates/doc_nb_js.js +++ b/bokeh_templates/doc_nb_js.js @@ -1,17 +1,7 @@ -{ % extends "try_run.js" % -} +{% extends "try_run.js" %} -{ % block code_to_run % -} -var docs_json = { - { - docs_json - } -}; -var render_items = { - { - render_items - } -}; -root.Bokeh.embed.embed_items_notebook(docs_json, render_items); { % endblock % -} +{% block code_to_run %} + var docs_json = {{ docs_json }}; + var render_items = {{ render_items }}; + 
root.Bokeh.embed.embed_items_notebook(docs_json, render_items); +{% endblock %} diff --git a/bokeh_templates/file.html b/bokeh_templates/file.html index 2e57b22..34891ca 100755 --- a/bokeh_templates/file.html +++ b/bokeh_templates/file.html @@ -1,29 +1,43 @@ -{# Renders Bokeh models into a basic .html file. :param title: value for `` -`` tags :type title: str :param plot_resources: typically the output of RESOURCES :type plot_resources: str :param plot_script: typically the output of PLOT_SCRIPT :type plot_script: str :param plot_div: typically the output of PLOT_DIV :type plot_div: - str Users can customize the file output by providing their own Jinja2 template that accepts these same parameters. #} - <!DOCTYPE html> - <html lang="en"> +{# +Renders Bokeh models into a basic .html file. - <head> - <meta charset="utf-8"> - <title>{{ title|e if title else "Bokeh Plot" }} - {{ bokeh_css }} {{ bokeh_js }} - - +:param plot_resources: typically the output of RESOURCES +:type plot_resources: str - - {{ plot_div|indent(8) }} {{ plot_script|indent(8) }} - +:param plot_script: typically the output of PLOT_SCRIPT +:type plot_script: str - +:param plot_div: typically the output of PLOT_DIV +:type plot_div: str + +Users can customize the file output by providing their own Jinja2 template +that accepts these same parameters. + +#} + + + + + {{ title|e if title else "Bokeh Plot" }} + {{ bokeh_css }} + {{ bokeh_js }} + + + + {{ plot_div|indent(8) }} + {{ plot_script|indent(8) }} + + diff --git a/bokeh_templates/js_resources.html b/bokeh_templates/js_resources.html index 244834b..014740d 100755 --- a/bokeh_templates/js_resources.html +++ b/bokeh_templates/js_resources.html @@ -1,27 +1,20 @@ -{# Renders HTML that loads BokehJS JavaScript code and CSS according to the configuration in a Resources object. :param js_files: a list of URIs for JS files to include :type js_files: list[str] :param js_raw: a list of raw JS snippets to put between -`` - -

Using Settings:

- - - - - - - - - - - - - - - -
Bokehversion{{ bokeh_version }}
BokehJSjs{{ js_info }}
css{{ css_info }}
-{%- endif %} {%- for warning in warnings %} -

{{ warning }}

-{%- endfor %} +:param js_info: information about the location, version, etc. of BokehJS code +:type js_info: str + +:param css_info: information about the location, version, etc. of BokehJS css +:type css_info: str + +:param warnings: a list of warnings to display to user +:type warnings: list[str] + +#} +
+ + Loading BokehJS ... +
+ {%- if verbose %} + +

Using Settings:

+ + + + + + + + + + + + + + + +
Bokehversion{{ bokeh_version }}
BokehJSjs{{ js_info }}
css{{ css_info }}
+ {%- endif %} + {%- for warning in warnings %} +

{{ warning }}

+ {%- endfor %} diff --git a/bokeh_templates/plot_div.html b/bokeh_templates/plot_div.html index 222d7ec..a1cf3e3 100755 --- a/bokeh_templates/plot_div.html +++ b/bokeh_templates/plot_div.html @@ -1,5 +1,11 @@ -{# Renders a basic plot div, that can be used in conjunction with PLOT_JS. :param elementid: a unique identifier for the `` -
`` a PLOT_JS template should be configured with the same ``elementid`` :type elementid: str #} -
+{# +Renders a basic plot div, that can be used in conjunction with PLOT_JS. + +:param elementid: a unique identifier for the ``
`` a PLOT_JS + template should be configured with the same ``elementid`` +:type elementid: str + +#} +
-
+
diff --git a/bokeh_templates/render_css.txt b/bokeh_templates/render_css.txt new file mode 100755 index 0000000..12f7135 --- /dev/null +++ b/bokeh_templates/render_css.txt @@ -0,0 +1,22 @@ + + + + \ No newline at end of file diff --git a/bokeh_templates/render_js.txt b/bokeh_templates/render_js.txt new file mode 100755 index 0000000..5eb00ef --- /dev/null +++ b/bokeh_templates/render_js.txt @@ -0,0 +1,270 @@ + + + + + + \ No newline at end of file diff --git a/bokeh_templates/script_tag.html b/bokeh_templates/script_tag.html index 98ea55c..bbe2e4b 100755 --- a/bokeh_templates/script_tag.html +++ b/bokeh_templates/script_tag.html @@ -1,20 +1,10 @@ -{# Renders a `` - diff --git a/bokeh_templates/try_run.js b/bokeh_templates/try_run.js index 96d6ead..2d76703 100755 --- a/bokeh_templates/try_run.js +++ b/bokeh_templates/try_run.js @@ -1,8 +1,7 @@ (function(root) { function embed_document(root) { - { % block code_to_run % - } { % endblock % - } + {% block code_to_run %} + {% endblock %} } if (root.Bokeh !== undefined) { embed_document(root); diff --git a/layer.png b/layer.png index ec6c5d0..661e4c6 100644 Binary files a/layer.png and b/layer.png differ diff --git a/screenshot.png b/screenshot.png index f2e9c6f..114cf76 100644 Binary files a/screenshot.png and b/screenshot.png differ diff --git a/stopwords/de.txt b/stopwords/de.txt new file mode 100644 index 0000000..19a248d --- /dev/null +++ b/stopwords/de.txt @@ -0,0 +1,618 @@ +a +ab +aber +aber +ach +acht +achte +achten +achter +achtes +ag +alle +allein +allem +allen +aller +allerdings +alles +allgemeinen +als +als +also +am +an +andere +anderen +andern +anders +au +auch +auch +auf +aus +ausser +außer +ausserdem +außerdem +b +bald +bei +beide +beiden +beim +beispiel +bekannt +bereits +besonders +besser +besten +bin +bis +bisher +bist +c +d +da +dabei +dadurch +dafür +dagegen +daher +dahin +dahinter +damals +damit +danach +daneben +dank +dann +daran +darauf +daraus +darf +darfst +darin +darüber +darum +darunter +das 
+das +dasein +daselbst +dass +daß +dasselbe +davon +davor +dazu +dazwischen +dein +deine +deinem +deiner +dem +dementsprechend +demgegenüber +demgemäss +demgemäß +demselben +demzufolge +den +denen +denn +denn +denselben +der +deren +derjenige +derjenigen +dermassen +dermaßen +derselbe +derselben +des +deshalb +desselben +dessen +deswegen +d.h +dich +die +diejenige +diejenigen +herrn +se +blos +selber +freilich +hr +hrn +jörgel +hans +dies +diese +dieselbe +dieselben +diesem +diesen +dieser +dieses +dir +doch +dort +drei +drin +dr +de +ans +anch +gt +herr +dritte +dritten +dritter +drittes +du +durch +durchaus +dürfen +dürft +durfte +durften +e +eben +ebenso +ehrlich +ei +ei, +ei, +eigen +eigene +eigenen +eigener +eigenes +ein +einander +eine +einem +einen +einer +eines +einige +einigen +einiger +einiges +einmal +einmal +eins +elf +en +ende +endlich +entweder +entweder +er +Ernst +erst +erste +ersten +erster +erstes +es +etwa +etwas +euch +f +früher +fünf +fünfte +fünften +fünfter +fünftes +für +g +gab +ganz +ganze +ganzen +ganzer +ganzes +gar +gedurft +gegen +gegenüber +gehabt +gehen +geht +gekannt +gekonnt +gemacht +gemocht +gemusst +genug +gerade +gern +gesagt +gesagt +geschweige +gewesen +gewollt +geworden +gibt +ging +gleich +gott +gross +groß +grosse +große +grossen +großen +grosser +großer +grosses +großes +gut +gute +guter +gutes +h +habe +haben +habt +hast +hat +hatte +hätte +hatten +hätten +heisst +her +heute +hier +hin +hinter +hoch +i +ich +ihm +ihn +ihnen +ihr +ihre +ihrem +ihren +ihrer +ihres +im +im +immer +in +in +indem +infolgedessen +ins +irgend +ist +j +ja +ja +jahr +jahre +jahren +je +jede +jedem +jeden +jeder +jedermann +jedermanns +jedoch +jemand +jemandem +jemanden +jene +jenem +jenen +jener +jenes +jetzt +k +kam +kann +kannst +kaum +kein +keine +keinem +keinen +keiner +kleine +kleinen +kleiner +kleines +kommen +kommt +können +könnt +konnte +könnte +konnten +kurz +l +lang +lange +lange +leicht +leide +lieber +los +m +machen +macht +machte +mag 
+magst +mahn +man +manche +manchem +manchen +mancher +manches +mann +mehr +mein +meine +meinem +meinen +meiner +meines +mensch +menschen +mich +mir +mit +mittel +mochte +möchte +mochten +mögen +möglich +mögt +morgen +muss +muß +müssen +musst +müsst +musste +mussten +n +na +nach +nachdem +nahm +natürlich +neben +nein +neue +neuen +neun +neunte +neunten +neunter +neuntes +nicht +nicht +nichts +nie +niemand +niemandem +niemanden +noch +nun +nun +nur +o +ob +ob +oben +oder +oder +offen +oft +oft +ohne +Ordnung +p +q +r +recht +rechte +rechten +rechter +rechtes +richtig +rund +s +sa +sache +sagt +sagte +sah +satt +schlecht +Schluss +schon +sechs +sechste +sechsten +sechster +sechstes +sehr +sei +sei +seid +seien +sein +seine +seinem +seinen +seiner +seines +seit +seitdem +selbst +selbst +sich +sie +sieben +siebente +siebenten +siebenter +siebentes +sind +so +solang +solche +solchem +solchen +solcher +solches +soll +sollen +sollte +sollten +sondern +sonst +sowie +später +statt +t +tag +tage +tagen +tat +teil +tel +tritt +trotzdem +tun +u +über +überhaupt +übrigens +uhr +um +und +und? +uns +unser +unsere +unserer +unter +v +vergangenen +viel +viele +vielem +vielen +vielleicht +vier +vierte +vierten +vierter +viertes +vom +von +vor +w +wahr? 
+während +währenddem +währenddessen +wann +war +wäre +waren +wart +warum +was +wegen +weil +weit +weiter +weitere +weiteren +weiteres +welche +welchem +welchen +welcher +welches +wem +wen +wenig +wenig +wenige +weniger +weniges +wenigstens +wenn +wenn +wer +werde +werden +werdet +wessen +wie +wie +wieder +will +willst +wir +wird +wirklich +wirst +wo +wohl +wollen +wollt +wollte +wollten +worden +wurde +würde +wurden +würden +x +y +z +z.b +zehn +zehnte +zehnten +zehnter +zehntes +zeit +zu +zuerst +zugleich +zum +zum +zunächst +zur +zurück +zusammen +zwanzig +zwar +zwar +zwei +zweite +zweiten +zweiter +zweites +zwischen +zwölf diff --git a/stopwords/en.txt b/stopwords/en.txt new file mode 100755 index 0000000..9b1d502 --- /dev/null +++ b/stopwords/en.txt @@ -0,0 +1,671 @@ +a +able +about +above +abst +accordance +according +accordingly +across +act +actually +added +adj +adopted +affected +affecting +affects +after +afterwards +again +against +ah +all +almost +alone +along +already +also +although +always +am +among +amongst +an +and +announce +another +any +anybody +anyhow +anymore +anyone +anything +anyway +anyways +anywhere +apparently +approximately +are +aren +arent +arise +around +as +aside +ask +asking +at +auth +available +away +awfully +b +back +be +became +because +become +becomes +becoming +been +before +beforehand +begin +beginning +beginnings +begins +behind +being +believe +below +beside +besides +between +beyond +biol +both +brief +briefly +but +by +c +ca +came +can +cannot +can't +cause +causes +certain +certainly +co +com +come +comes +contain +containing +contains +could +couldnt +d +date +did +didn't +different +do +does +doesn't +doing +done +don't +down +downwards +due +during +e +each +ed +edu +effect +eg +eight +eighty +either +else +elsewhere +end +ending +enough +especially +et +et-al +etc +even +ever +every +everybody +everyone +everything +everywhere +ex +except +f +far +few +ff +fifth +first +five +fix +followed +following +follows +for 
+former +formerly +forth +found +four +from +further +furthermore +g +gave +get +gets +getting +give +given +gives +giving +go +goes +gone +got +gotten +h +had +happens +hardly +has +hasn't +have +haven't +having +he +hed +hence +her +here +hereafter +hereby +herein +heres +hereupon +hers +herself +hes +hi +hid +him +himself +his +hither +home +how +howbeit +however +hundred +i +id +ie +if +i'll +im +immediate +immediately +importance +important +in +inc +indeed +index +information +instead +into +invention +inward +is +isn't +it +itd +it'll +its +itself +i've +j +just +k +keep +keeps +kept +keys +kg +km +know +known +knows +l +largely +last +lately +later +latter +latterly +least +less +lest +let +lets +like +liked +likely +line +little +'ll +look +looking +looks +ltd +m +made +mainly +make +makes +many +may +maybe +me +mean +means +meantime +meanwhile +merely +mg +might +million +miss +ml +more +moreover +most +mostly +mr +mrs +much +mug +must +my +myself +n +na +name +namely +nay +nd +near +nearly +necessarily +necessary +need +needs +neither +never +nevertheless +new +next +nine +ninety +no +nobody +non +none +nonetheless +noone +nor +normally +nos +not +noted +nothing +now +nowhere +o +obtain +obtained +obviously +of +off +often +oh +ok +okay +old +omitted +on +once +one +ones +only +onto +or +ord +other +others +otherwise +ought +our +ours +ourselves +out +outside +over +overall +owing +own +p +page +pages +part +particular +particularly +past +per +perhaps +placed +please +plus +poorly +possible +possibly +potentially +pp +predominantly +present +previously +primarily +probably +promptly +proud +provides +put +q +que +quickly +quite +qv +r +ran +rather +rd +re +readily +really +recent +recently +ref +refs +regarding +regardless +regards +related +relatively +research +respectively +resulted +resulting +results +right +run +s +said +same +saw +say +saying +says +sec +section +see +seeing +seem +seemed +seeming +seems +seen +self +selves +sent +seven +several 
+shall +she +shed +she'll +shes +should +shouldn't +show +showed +shown +showns +shows +significant +significantly +similar +similarly +since +six +slightly +so +some +somebody +somehow +someone +somethan +something +sometime +sometimes +somewhat +somewhere +soon +sorry +specifically +specified +specify +specifying +state +states +still +stop +strongly +sub +substantially +successfully +such +sufficiently +suggest +sup +sure +t +take +taken +taking +tell +tends +th +than +thank +thanks +thanx +that +that'll +thats +that've +the +their +theirs +them +themselves +then +thence +there +thereafter +thereby +thered +therefore +therein +there'll +thereof +therere +theres +thereto +thereupon +there've +these +they +theyd +they'll +theyre +they've +think +this +those +thou +though +thoughh +thousand +throug +through +throughout +thru +thus +til +tip +to +together +too +took +toward +towards +tried +tries +truly +try +trying +ts +twice +two +u +un +under +unfortunately +unless +unlike +unlikely +until +unto +up +upon +ups +us +use +used +useful +usefully +usefulness +uses +using +usually +v +value +various +'ve +very +via +viz +vol +vols +vs +w +want +wants +was +wasn't +way +we +wed +welcome +we'll +went +were +weren't +we've +what +whatever +what'll +whats +when +whence +whenever +where +whereafter +whereas +whereby +wherein +wheres +whereupon +wherever +whether +which +while +whim +whither +who +whod +whoever +whole +who'll +whom +whomever +whos +whose +why +widely +willing +wish +with +within +without +won't +words +world +would +wouldn't +www +x +y +yes +yet +you +youd +you'll +your +youre +yours +yourself +yourselves +you've +z +zero diff --git a/stopwords/es.txt b/stopwords/es.txt new file mode 100755 index 0000000..33c1e6d --- /dev/null +++ b/stopwords/es.txt @@ -0,0 +1,351 @@ +él +ésta +éstas +éste +éstos +última +últimas +último +últimos +a +añadió +aún +actualmente +adelante +además +afirmó +agregó +ahí +ahora +al +algún +algo +alguna +algunas +alguno +algunos 
+alrededor +ambos +ante +anterior +antes +apenas +aproximadamente +aquí +así +aseguró +aunque +ayer +bajo +bien +buen +buena +buenas +bueno +buenos +cómo +cada +casi +cerca +cierto +cinco +comentó +como +con +conocer +consideró +considera +contra +cosas +creo +cual +cuales +cualquier +cuando +cuanto +cuatro +cuenta +da +dado +dan +dar +de +debe +deben +debido +decir +dejó +del +demás +dentro +desde +después +dice +dicen +dicho +dieron +diferente +diferentes +dijeron +dijo +dio +donde +dos +durante +e +ejemplo +el +ella +ellas +ello +ellos +embargo +en +encuentra +entonces +entre +era +eran +es +esa +esas +ese +eso +esos +está +están +esta +estaba +estaban +estamos +estar +estará +estas +este +esto +estos +estoy +estuvo +ex +existe +existen +explicó +expresó +fin +fue +fuera +fueron +gran +grandes +ha +había +habían +haber +habrá +hace +hacen +hacer +hacerlo +hacia +haciendo +han +hasta +hay +haya +he +hecho +hemos +hicieron +hizo +hoy +hubo +igual +incluso +indicó +informó +junto +la +lado +las +le +les +llegó +lleva +llevar +lo +los +luego +lugar +más +manera +manifestó +mayor +me +mediante +mejor +mencionó +menos +mi +mientras +misma +mismas +mismo +mismos +momento +mucha +muchas +mucho +muchos +muy +nada +nadie +ni +ningún +ninguna +ningunas +ninguno +ningunos +no +nos +nosotras +nosotros +nuestra +nuestras +nuestro +nuestros +nueva +nuevas +nuevo +nuevos +nunca +o +ocho +otra +otras +otro +otros +para +parece +parte +partir +pasada +pasado +pero +pesar +poca +pocas +poco +pocos +podemos +podrá +podrán +podría +podrían +poner +por +porque +posible +próximo +próximos +primer +primera +primero +primeros +principalmente +propia +propias +propio +propios +pudo +pueda +puede +pueden +pues +qué +que +quedó +queremos +quién +quien +quienes +quiere +realizó +realizado +realizar +respecto +sí +sólo +se +señaló +sea +sean +según +segunda +segundo +seis +ser +será +serán +sería +si +sido +siempre +siendo +siete +sigue +siguiente +sin +sino +sobre +sola +solamente +solas 
+solo +solos +son +su +sus +tal +también +tampoco +tan +tanto +tenía +tendrá +tendrán +tenemos +tener +tenga +tengo +tenido +tercera +tiene +tienen +toda +todas +todavía +todo +todos +total +tras +trata +través +tres +tuvo +un +una +unas +uno +unos +usted +va +vamos +van +varias +varios +veces +ver +vez +y +ya +yo diff --git a/stopwords/fr.txt b/stopwords/fr.txt new file mode 100755 index 0000000..d51add3 --- /dev/null +++ b/stopwords/fr.txt @@ -0,0 +1,463 @@ +a +à +â +abord +afin +ah +ai +aie +ainsi +allaient +allo +allô +allons +après +assez +attendu +au +aucun +aucune +aujourd +aujourd'hui +auquel +aura +auront +aussi +autre +autres +aux +auxquelles +auxquels +avaient +avais +avait +avant +avec +avoir +ayant +b +bah +beaucoup +bien +bigre +boum +bravo +brrr +c +ça +car +ce +ceci +cela +celle +celle-ci +celle-là +celles +celles-ci +celles-là +celui +celui-ci +celui-là +cent +cependant +certain +certaine +certaines +certains +certes +ces +cet +cette +ceux +ceux-ci +ceux-là +chacun +chaque +cher +chère +chères +chers +chez +chiche +chut +ci +cinq +cinquantaine +cinquante +cinquantième +cinquième +clac +clic +combien +comme +comment +compris +concernant +contre +couic +crac +d +da +dans +de +debout +dedans +dehors +delà +depuis +derrière +des +dès +désormais +desquelles +desquels +dessous +dessus +deux +deuxième +deuxièmement +devant +devers +devra +différent +différente +différentes +différents +dire +divers +diverse +diverses +dix +dix-huit +dixième +dix-neuf +dix-sept +doit +doivent +donc +dont +douze +douzième +dring +du +duquel +durant +e +effet +eh +elle +elle-même +elles +elles-mêmes +en +encore +entre +envers +environ +es +ès +est +et +etant +étaient +étais +était +étant +etc +été +etre +être +eu +euh +eux +eux-mêmes +excepté +f +façon +fais +faisaient +faisant +fait +feront +fi +flac +floc +font +g +gens +h +ha +hé +hein +hélas +hem +hep +hi +ho +holà +hop +hormis +hors +hou +houp +hue +hui +huit +huitième +hum +hurrah +i +il +ils +importe +j +je +jusqu 
+jusque +k +l +la +là +laquelle +las +le +lequel +les +lès +lesquelles +lesquels +leur +leurs +longtemps +lorsque +lui +lui-même +m +ma +maint +mais +malgré +me +même +mêmes +merci +mes +mien +mienne +miennes +miens +mille +mince +moi +moi-même +moins +mon +moyennant +n +na +ne +néanmoins +neuf +neuvième +ni +nombreuses +nombreux +non +nos +notre +nôtre +nôtres +nous +nous-mêmes +nul +o +o| +ô +oh +ohé +olé +ollé +on +ont +onze +onzième +ore +ou +où +ouf +ouias +oust +ouste +outre +p +paf +pan +par +parmi +partant +particulier +particulière +particulièrement +pas +passé +pendant +personne +peu +peut +peuvent +peux +pff +pfft +pfut +pif +plein +plouf +plus +plusieurs +plutôt +pouah +pour +pourquoi +premier +première +premièrement +près +proche +psitt +puisque +q +qu +quand +quant +quanta +quant-à-soi +quarante +quatorze +quatre +quatre-vingt +quatrième +quatrièmement +que +quel +quelconque +quelle +quelles +quelque +quelques +quelqu'un +quels +qui +quiconque +quinze +quoi +quoique +r +revoici +revoilà +rien +s +sa +sacrebleu +sans +sapristi +sauf +se +seize +selon +sept +septième +sera +seront +ses +si +sien +sienne +siennes +siens +sinon +six +sixième +soi +soi-même +soit +soixante +son +sont +sous +stop +suis +suivant +sur +surtout +t +ta +tac +tant +te +té +tel +telle +tellement +telles +tels +tenant +tes +tic +tien +tienne +tiennes +tiens +toc +toi +toi-même +ton +touchant +toujours +tous +tout +toute +toutes +treize +trente +très +trois +troisième +troisièmement +trop +tsoin +tsouin +tu +u +un +une +unes +uns +v +va +vais +vas +vé +vers +via +vif +vifs +vingt +vivat +vive +vives +vlan +voici +voilà +vont +vos +votre +vôtre +vôtres +vous +vous-mêmes +vu +w +x +y +z +zut diff --git a/templates/help.html b/templates/help.html index 67e2557..ff96432 100755 --- a/templates/help.html +++ b/templates/help.html @@ -55,7 +55,7 @@

Help on Topics

Contact and Issues

-

Comments are welcome, as are reports of bugs and typos. Please use the project's issue tracker on GitHub.

+

Please use the project's issue tracker on GitHub (https://github.com/DARIAH-DE/TopicsExplorer) for comments, bugs, and typos.

For general questions, write an e-mail to Dr. Steffen Pielström; for technical questions, contact Severin Simmler.

What is Topic Modeling?

diff --git a/templates/index.html b/templates/index.html index 594f13e..e4e7d3d 100755 --- a/templates/index.html +++ b/templates/index.html @@ -72,16 +72,16 @@

1. Preprocessing

1.1. Reading a Corpus of Documents

-

For this workflow, you will need a corpus (a set of texts) as plain text (.txt) or TEI XML (.xml). Use the button below to select multiple text files. To gain better results, choose at least five documents (but the more the +

For this workflow, you will need a corpus (a set of texts) as plain text (.txt) or XML (.xml). TEI-encoded XML is fully supported; in that case, only the text part is processed. Use the button below to select multiple text files. For better results, choose at least five documents (the more, the better).

- Tip: The TextGrid Repository (https://textgridrep.org) is a great place to start searching for text data. It's Open Access and provides a lot of literary texts in valid and well-formed TEI XML. + Tip: The TextGrid Repository (https://textgridrep.org) is a great place to start searching for text data. It's Open Access and provides a lot of literary texts in valid and well-formed TEI XML.



1.2. Tokenization

-

An important preprocessing step is tokenization. Without identifying tokens, it is difficult to extract necessary information, such as most frequent tokens, also known as stopwords, or token frequencies in general. In this +

An important preprocessing step is tokenization. Without identifying tokens, it is difficult to extract necessary information, such as token frequencies in general, or the most frequent tokens, also known as stopwords. In this application, one token consists of one or more characters, optionally followed by exactly one punctuation mark (a hyphen or something related), followed by one or more characters. For example, the phrase “her father's arm-chair” will be tokenized as ["her", "father's", "arm-chair"].

1.3. Cleaning the Corpus

@@ -102,14 +102,17 @@

2. Modeling

An iteration is a process of repeating the same action multiple times to achieve a specific goal. This is how LDA works. The number of sampling iterations should be a trade-off between the time taken to complete sampling and the quality of the model. The default value produces quite good results, but feel free to increase the number of iterations.

-
+
This step can take quite a while: anything from a few seconds to several hours, depending on corpus size and the number of iterations.

3. Visualizing

When using LDA to explore text collections, we are typically interested in examining texts in terms of their constituent topics (instead of word frequencies). Because the number of topics is so much smaller than the number of unique vocabulary elements (say, 10 versus 10,000), a range of data visualization methods become available. As you will see, all of the provided visualizations are interactive, but you will have the ability to save the plots - as a static image file.


+ as a static image file.

+
+ + You need an active internet connection!


diff --git a/templates/model.html b/templates/model.html index 75cf30a..b775816 100755 --- a/templates/model.html +++ b/templates/model.html @@ -45,9 +45,6 @@
  • Reset
  • -
  • - Save Data -
  • Help
  • @@ -66,11 +63,16 @@

    Topics – Easy Topic Modeling

    1. Corpus and Parameter Summary

    -

    +

    All parameters, including some corpus statistics, are summed up in the following table. This kind of information might be useful if you create more than one topic model and want to compare the results. The most common way to evaluate a probabilistic model is to measure the log-likelihood (if you are interested in the evaluation of probabilistic models, have a look at Wallach et al. 2009: Evaluation Methods for Topic Models, a mathematical approach). If you increase the number of iterations, your model gets better, and you will see that the log-likelihood also increases up to a certain point. This is how you might find out the ideal number of iterations. {% for table in parameter %} {{ table|safe }} {% endfor %}
    + As you can see, your corpus is much smaller after cleaning. You either defined a threshold for the most frequent words, or selected an external stopword list. In addition, so-called hapax legomena have been removed. In corpus linguistics, a hapax legomenon is a word that occurs only once within a context. So, if a word occurs only once in a document, it is very likely that the word is semantically insignificant – meaning not useful for the topic modeling algorithm.

    {{ corpus_boxplot_div|safe }}


    +
    + + FYI: All of the generated data (tables and graphics) was automatically saved in a ZIP archive (topicmodeling.zip) in your current working directory: {{ cwd|safe }}. +

    2. Inspecting the Topic Model

    Topic Models are unsupervised. It is called unsupervised, because you did not have any labels describing the semantic structures or anything related, but only pure word frequencies. Since the examples given to the algorithm are unlabeled, there is no evaluation of the accuracy, or how good your model is. So, it is up to you now by inspecting the model to decide whether you are satisfied with its performance or not. @@ -89,9 +91,11 @@

    2.2. Topics and Documents

    thematic developments over a set of texts as well as a single text, akin to a dynamic topic model. What also can become apparent here, is that some topics correlate highly with a specific author or group of authors, while other topics correlate highly with a specific text or group of texts. All in all, this displays two of LDA's properties – its use as a distant reading tool that aims to get at text meaning, and its use as a provider of data that can be further used in computational analysis, such as document classification or authorship attribution.


    {{ heatmap_div|safe }}

    -

    2.3. Topic Proportions of Documents

    +

    2.3. Distribution of Topics

    +

    In the following graphic, you can access one dimension of the information displayed in the heatmap above. This might be a clearer approach if you are interested in a specific topic, or, more precisely, how the topic is distributed over the documents of your corpus. Use the dropdown menu to select a topic. The proportions you see by default are based on the first topic.

    {{ topics_div|safe }}
    -

    2.4. Document Proportions of Topics

    +

    2.4. Distribution of Documents

    +

    Similarly, you can access the other dimension displayed in the heatmap. So, if you are interested in a specific document, you have the ability to select it via the dropdown menu and inspect its proportions. The bars displayed by default are based on the first document.

    {{ documents_div|safe }}

    2. Diving Deeper into Topic Modeling

    We want to empower users with little or no previous experience and programming skills to create custom workflows mostly using predefined functions within a familiar environment. So, if this practical introduction aroused your interest and diff --git a/templates/modeling.html b/templates/modeling.html index 6e74064..335200c 100644 --- a/templates/modeling.html +++ b/templates/modeling.html @@ -49,7 +49,7 @@

    -


    +


    Collecting user input ...

    FYI: This might take a while...
    In the meantime, have a look at
    our Jupyter notebook introducing
    topic modeling with MALLET.
    {% for info_1, info_2, info_3, info_4, info_5 in logging %} diff --git a/topicsexplorer.py b/topicsexplorer.py new file mode 100755 index 0000000..4d51470 --- /dev/null +++ b/topicsexplorer.py @@ -0,0 +1,75 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import sys +from pathlib import Path +from PyQt5.QtGui import QIcon +from PyQt5.QtWidgets import QApplication +from PyQt5.QtWebEngineWidgets import QWebEngineView +from PyQt5.QtCore import QUrl, QThread + +import lzma +import pickle +import time +import re +import logging +from lxml import etree +from bokeh.plotting import figure +from bokeh.models import CustomJS, ColumnDataSource, HoverTool +from bokeh.models.widgets import Dropdown +from bokeh.layouts import column +import lda +from threading import Thread +import queue + +from flask import Flask, request, render_template, Response, stream_with_context +import pandas as pd +import time +from bokeh.plotting import output_file, save +from bokeh.embed import components +from dariah_topics import preprocessing +from dariah_topics import postprocessing +from dariah_topics import visualization +import tempfile +import shutil +import numpy as np +from werkzeug.utils import secure_filename + + +PORT = 5000 +ROOT_URL = 'http://localhost:{}'.format(PORT) + +class FlaskThread(QThread): + def __init__(self, application): + QThread.__init__(self) + self.application = application + + def __del__(self): + self.wait() + + def run(self): + self.application.run(port=PORT) + + +def ProvideGui(application): + qtapp = QApplication(sys.argv) + + webapp = FlaskThread(application) + webapp.start() + + qtapp.aboutToQuit.connect(webapp.terminate) + + webview = QWebEngineView() + webview.resize(1200, 660) + webview.setWindowTitle('Topics Explorer') + webview.setWindowIcon(QIcon(str(Path('static', 'img', 'page_icon.png')))) + + webview.load(QUrl(ROOT_URL)) + webview.show() + + return qtapp.exec_() + + +if __name__ == '__main__': + from webapp import app + 
sys.exit(ProvideGui(app)) diff --git a/webapp.spec b/topicsexplorer.spec similarity index 57% rename from webapp.spec rename to topicsexplorer.spec index 0326154..9bad1b5 100755 --- a/webapp.spec +++ b/topicsexplorer.spec @@ -3,10 +3,10 @@ block_cipher = None -a = Analysis(['webapp.py'], +a = Analysis(['topicsexplorer.py'], pathex=[], binaries=[], - datas=[('static', 'static'), ('templates', 'templates'), ('bokeh_templates', 'bokeh_templates')], + datas=[('webapp.py', '.'), ('utils.py', '.'), ('static', 'static'), ('templates', 'templates'), ('bokeh_templates', 'bokeh_templates')], hiddenimports=[], hookspath=['hooks'], runtime_hooks=[], @@ -18,14 +18,18 @@ pyz = PYZ(a.pure, a.zipped_data, cipher=block_cipher) exe = EXE(pyz, a.scripts, - a.binaries, - a.zipfiles, - a.datas, - name='webapp', + exclude_binaries=True, + name='DARIAH Topics Explorer', debug=False, strip=False, upx=False, - runtime_tmpdir=None, console=False, #icon='static/img/app_icon.png', for macos icon='static/img/app_icon.ico') +coll = COLLECT(exe, + a.binaries, + a.zipfiles, + a.datas, + strip=False, + upx=False, + name='DARIAH Topics Explorer') diff --git a/utils.py b/utils.py index 9bc989b..b461036 100644 --- a/utils.py +++ b/utils.py @@ -5,18 +5,22 @@ import pickle import time import re +from pathlib import Path +import logging from lxml import etree from bokeh.plotting import figure from bokeh.models import CustomJS, ColumnDataSource, HoverTool from bokeh.models.widgets import Dropdown from bokeh.layouts import column import lda +from threading import Thread +import queue __author__ = "Severin Simmler" __email__ = "severin.simmler@stud-mail.uni-wuerzburg.de" -TOOLS = 'hover, pan, reset, save, wheel_zoom, zoom_in, zoom_out' +TOOLS = 'hover, pan, reset, wheel_zoom, zoom_in, zoom_out' JAVASCRIPT = """ var f = cb_obj.value; var options = %s; @@ -37,13 +41,12 @@ def compress(data, filepath): with open(filepath, 'wb') as file: - file.write(lzma.compress(pickle.dumps(data, pickle.HIGHEST_PROTOCOL))) 
+ pickle.dump(data, file) def decompress(filepath): with open(filepath, 'rb') as file: - data = lzma.decompress(file.read()) - return pickle.loads(data) + return pickle.load(file) def process_xml(file): @@ -73,7 +76,7 @@ def outliers(group): (group.score < lower.loc[cat]['score'])]['score'] out = groups.apply(outliers).dropna() - fig = figure(tools='save', background_fill_color='#EFE8E2', title='', x_range=x_labels, + fig = figure(tools='', background_fill_color='#EFE8E2', title='', x_range=x_labels, logo=None, sizing_mode='fixed', plot_width=500, plot_height=350) qmin = groups.quantile(q=0.00) @@ -113,7 +116,10 @@ def barchart(document_topics, height, topics=None, script=JAVASCRIPT, tools=TOOL option = re.sub(' ', '_', option) bar = fig.hbar(y='Describer', right='Proportion', source=source, height=0.5, color='#053967') - bar.visible = False + if i == 0: + bar.visible = True + else: + bar.visible = False plots[option] = bar fig.xgrid.grid_line_color = None @@ -123,16 +129,17 @@ def barchart(document_topics, height, topics=None, script=JAVASCRIPT, tools=TOOL fig.xaxis.major_label_text_font_size = '9pt' fig.yaxis.major_label_text_font_size = '9pt' - callback = CustomJS(args=plots, code=script % list(plots.keys())) + options = list(plots.keys()) + callback = CustomJS(args=plots, code=script % options) - menu = [(select, re.sub(' ', '_', option)) for select, option in zip(document_topics.index, options)] if topics is not None: selection = [' '.join(topics.iloc[i].tolist()) + ' ...' 
for i in range(topics.shape[0])] - menu = [(select, re.sub(' ', '_', option)) for select, option in zip(selection, options)] - dropdown = Dropdown(label='Select topic to display proportion', menu=menu, callback=callback) + menu = [(select, option) for select, option in zip(selection, options)] + label = 'Select topic to display proportions' else: menu = [(select, option) for select, option in zip(document_topics.index, options)] - dropdown = Dropdown(label='Select document to display proportion', menu=menu, callback=callback) + label = 'Select document to display proportions' + dropdown = Dropdown(label=label, menu=menu, callback=callback) return column(dropdown, fig, sizing_mode='scale_width') @@ -157,6 +164,24 @@ def read_logfile(logfile): return 0 -def lda_modeling(document_term_arr, n_topics, n_iter): +def lda_modeling(document_term_arr, n_topics, n_iter, tempdir): + file = str(Path(tempdir, 'topicmodeling.log')) + handler = logging.FileHandler(file, 'w') + lda_log = logging.getLogger('lda') + lda_log.setLevel(logging.INFO) + lda_log.addHandler(handler) + model = lda.LDA(n_topics=n_topics, n_iter=n_iter) - return model.fit(document_term_arr) + model.fit(document_term_arr) + with open(file, 'a', encoding='utf-8') as f: + f.write('DONE') + return model + + +def enthread(target, args): + q = queue.Queue() + def wrapper(): + q.put(target(*args)) + t = Thread(target=wrapper) + t.start() + return q diff --git a/webapp.py b/webapp.py index 29d03a1..f83af01 100755 --- a/webapp.py +++ b/webapp.py @@ -8,7 +8,6 @@ import time from bokeh.plotting import output_file, save from bokeh.embed import components -from bokeh.resources import INLINE from dariah_topics import preprocessing from dariah_topics import postprocessing from dariah_topics import visualization @@ -16,7 +15,6 @@ import tempfile import sys import shutil -from multiprocessing import Pool import numpy as np from werkzeug.utils import secure_filename @@ -26,23 +24,21 @@ tempdir = tempfile.mkdtemp() -NUM_KEYS = 
10 +NUM_KEYS = 8 if getattr(sys, 'frozen', False): app = Flask(__name__, template_folder=str(Path(sys._MEIPASS, 'templates')), static_folder=str(Path(sys._MEIPASS, 'static'))) + bokeh_resources = str(Path(sys._MEIPASS, 'bokeh_templates')) else: app = Flask(__name__) + bokeh_resources = 'bokeh_templates' @app.route('/') def index(): - lda_log = logging.getLogger('lda') - lda_log.setLevel(logging.INFO) - handler = logging.FileHandler(str(Path(tempdir, 'topicmodeling.log')), 'w') - lda_log.addHandler(handler) return render_template('index.html') @@ -58,11 +54,11 @@ def modeling(): @app.route('/model') def model(): - data = utils.decompress(str(Path(tempdir, 'data.bin.xz'))) - parameter = pd.read_csv(str(Path(tempdir, 'parameter.csv')), index_col=0) + data = utils.decompress(str(Path(tempdir, 'data.pickle'))) + parameter = pd.read_csv(str(Path(tempdir, 'parameter.csv')), index_col=0, encoding='utf-8') parameter.columns = [''] data['parameter'] = [parameter.to_html(classes=['parameter'], border=0)] - data['topics'] = [pd.read_csv(str(Path(tempdir, 'topics.csv')), index_col=0).to_html(classes='topics')] + data['topics'] = [pd.read_csv(str(Path(tempdir, 'topics.csv')), index_col=0, encoding='utf-8').to_html(classes='topics')] return render_template('model.html', **data) @@ -110,6 +106,7 @@ def create_model(): tokenized_corpus[filename.stem] = tokens parameter['Corpus size (raw), in tokens'] += len(tokens) file.flush() + yield "Creating document-term matrix ...", INFO_2A, INFO_3A, INFO_4A, INFO_5A document_labels = tokenized_corpus.index document_term_matrix = preprocessing.create_document_term_matrix(tokenized_corpus, document_labels) @@ -148,17 +145,19 @@ def create_model(): INFO_5B = INFO_5B.format(parameter['Number of topics']) yield "Initializing LDA topic model ...", INFO_2B, INFO_3B, INFO_4B, INFO_5B - - pool = Pool(processes=2) - model = pool.apply_async(utils.lda_modeling, [document_term_arr, user_input['num_topics'], user_input['num_iterations']]) + + model = 
utils.enthread(target=utils.lda_modeling, + args=(document_term_arr, user_input['num_topics'], user_input['num_iterations'], tempdir)) while True: - yield 'Iteration {0} of {1} ...'.format(pool.apply_async(utils.read_logfile, [str(Path(tempdir, 'topicmodeling.log'))]).get(), user_input['num_iterations']), INFO_2B, INFO_3B, INFO_4B, INFO_5B - if model.ready(): + msg = utils.read_logfile(str(Path(tempdir, 'topicmodeling.log'))) + + if msg == None: model = model.get() - pool.close() break + else: + yield 'Iteration {0} of {1} ...'.format(msg, user_input['num_iterations']), INFO_2B, INFO_3B, INFO_4B, INFO_5B - parameter['The model log likelihood'] = round(model.loglikelihood()) + parameter['The model log-likelihood'] = round(model.loglikelihood()) yield "Accessing topics ...", INFO_2B, INFO_3B, INFO_4B, INFO_5B topics = postprocessing.show_topics(model=model, vocabulary=vocabulary, num_keys=NUM_KEYS) @@ -184,7 +183,8 @@ def create_model(): fig = visualization.PlotDocumentTopics(document_topics_heatmap, enable_notebook=False) heatmap = fig.interactive_heatmap(height=height, - sizing_mode='scale_width') + sizing_mode='scale_width', + tools='hover, pan, reset, wheel_zoom, zoom_in, zoom_out') output_file(str(Path(tempdir, 'heatmap.html'))) save(heatmap) @@ -200,7 +200,7 @@ def create_model(): height = 10 * 18 else: height = document_topics.shape[1] * 18 - topics_barchart = utils.barchart(document_topics, height=height) + topics_barchart = utils.barchart(document_topics, height=height, topics=topics) topics_script, topics_div = components(topics_barchart) output_file(str(Path(tempdir, 'topics_barchart.html'))) save(topics_barchart) @@ -209,13 +209,17 @@ def create_model(): height = 10 * 18 else: height = document_topics.shape[0] * 18 - documents_barchart = utils.barchart(document_topics.T, height=height, topics=topics) + documents_barchart = utils.barchart(document_topics.T, height=height) documents_script, documents_div = components(documents_barchart) 
output_file(str(Path(tempdir, 'document_topics_barchart.html'))) save(documents_barchart) - js_resources = INLINE.render_js() - css_resources = INLINE.render_css() + + with open(str(Path(bokeh_resources, 'render_js.txt')), 'r', encoding='utf-8') as file: + js_resources = file.read() + with open(str(Path(bokeh_resources, 'render_css.txt')), 'r', encoding='utf-8') as file: + css_resources = file.read() + end = time.time() passed_time = round((end - start) / 60) @@ -225,11 +229,12 @@ def create_model(): parameter['Passed time, in minutes'] = passed_time parameter = pd.DataFrame(pd.Series(parameter)) - topics.to_csv(str(Path(tempdir, 'topics.csv'))) - document_topics.to_csv(str(Path(tempdir, 'document_topics.csv'))) - parameter.to_csv(str(Path(tempdir, 'parameter.csv'))) - - shutil.make_archive(str(Path(app.static_folder, 'topicmodeling')), 'zip', tempdir) + topics.to_csv(str(Path(tempdir, 'topics.csv')), encoding='utf-8') + document_topics.to_csv(str(Path(tempdir, 'document_topics.csv')), encoding='utf-8') + parameter.to_csv(str(Path(tempdir, 'parameter.csv')), encoding='utf-8') + + cwd = str(Path(*Path.cwd().parts[:-1])) + shutil.make_archive(str(Path(cwd, 'topicmodeling')), 'zip', tempdir) data = {'heatmap_script': heatmap_script, 'heatmap_div': heatmap_div, @@ -240,8 +245,9 @@ def create_model(): 'js_resources': js_resources, 'css_resources': css_resources, 'corpus_boxplot_script': corpus_boxplot_script, - 'corpus_boxplot_div': corpus_boxplot_div} - utils.compress(data, str(Path(tempdir, 'data.bin.xz'))) + 'corpus_boxplot_div': corpus_boxplot_div, + 'cwd': cwd} + utils.compress(data, str(Path(tempdir, 'data.pickle'))) yield 'render_result', '', '', '', ''