Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Java #2

Open
wants to merge 16 commits into
base: master
Choose a base branch
from
3 changes: 3 additions & 0 deletions .directory
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[Dolphin]
Timestamp=2017,7,29,0,16,50
Version=3
8 changes: 8 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,14 @@
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839


# Bin and txt files
*.bin
*.txt
*.npy



# User-specific stuff:
.idea/**/workspace.xml
.idea/**/tasks.xml
Expand Down
7 changes: 0 additions & 7 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ ADD . $HOME_DIR/word2vec

RUN apt-get -y update && \
apt-get -y install \
python3-pip \
language-pack-en \
vim \
libopenblas-dev
Expand Down Expand Up @@ -60,12 +59,6 @@ ENV LANG en_US.UTF-8
ENV LANGUAGE en_US:en
ENV LC_ALL en_US.UTF-8

############################################################
# Exposing ports
############################################################

EXPOSE 9300 9200 2181 9092

############################################################
# Running the uberjar
############################################################
Expand Down
9 changes: 0 additions & 9 deletions conf/parser-conf.properties

This file was deleted.

21 changes: 15 additions & 6 deletions conf/word2vec-default.properties
Original file line number Diff line number Diff line change
@@ -1,11 +1,20 @@


# path to the text corpus (text corpus should be a txt file where each line represents a document).
input.corpus.path=corpus/sample-data.txt
output.model.save.path=model/model-alpha.bin

# path (name) to the word2vec model to be saved.
output.model.save.path=model/model-v-0.1.0-alpha.bin

# minimum frequency of words to be used in training (words that occur fewer times than this are dropped from the vocabulary).
min.word.frequency=2
number.of.iterations=100
layer.size=300

# number of training epochs.
number.of.iterations=10

# dimension of the word vectors.
layer.size=250

# window size for choosing the context of a word.
window.size=5
learning.rate=0.01

# learning rate for the algorithm.
learning.rate=0.015
9 changes: 5 additions & 4 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
version: '2'
services:
word2vec_trainer:
build: ./
#image: registry.gitlab.com/hosseinabedi/meliora:development
build: .
container_name: word2wec
volumes:
- ./conf:/home/badger/conf
- ./log:/home/badger/log
- .corpus:/home/badger/corpus
- ./conf:/home/word2vec/conf
- ./log:/home/word2vec/log
- ./corpus:/home/word2vec/corpus

322 changes: 0 additions & 322 deletions meliora.iml

This file was deleted.

Empty file added model/.keep
Empty file.
Binary file removed model/model-alpha.bin
Binary file not shown.
151 changes: 151 additions & 0 deletions src/main/java/Cleaner.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Cleans raw text so that only Persian/Arabic-script words remain.
 *
 * <p>Cleaning consists of: flattening line breaks, deleting every character
 * that is outside the accepted Unicode ranges, dropping tokens that contain
 * no right-to-left character, replacing punctuation marks with spaces and
 * finally removing stop words. The punctuation and stop-word lists are read
 * from UTF-8 text files (one entry per line) when the instance is created.
 *
 * @author behnam
 */
public class Cleaner {

    private final String punctuationPath = "stoplists/Cleaner/Punctuations.txt";
    private final String conjPath = "stoplists/Persian/CONJ.txt";
    private final String detPath = "stoplists/Persian/DET.txt";
    private final String pPath = "stoplists/Persian/P.txt";
    private final String postpPath = "stoplists/Persian/POSTP.txt";
    private final String proPath = "stoplists/Persian/PRO.txt";
    private final String stopwordPath = "stoplists/Persian/persian.txt";

    private List<String> punctuations;
    // The five part-of-speech lists below are currently neither loaded nor
    // applied; only the punctuation list and the general stop-word list are used.
    private List<String> conj;
    private List<String> det;
    private List<String> p;
    private List<String> postp;
    private List<String> pro;
    private List<String> stopword;

    /** Matches a single character from the Arabic, Arabic Supplement, Hebrew or Arabic Presentation Forms-B ranges. */
    public static final Pattern RTL_CHARACTERS = Pattern.compile("[\u0600-\u06FF\u0750-\u077F\u0590-\u05FF\uFE70-\uFEFF]");

    /**
     * Matches every character that {@link #clean(String)} deletes: anything that is
     * not in U+0600-U+065F, U+066A-U+06EF, U+06FA-U+06FF, a hyphen or whitespace.
     * Compiled once here instead of on every call.
     */
    private static final Pattern DISALLOWED_CHARACTERS =
            Pattern.compile("[^\\u0600-\\u065F\\u066A-\\u06EF\\u06FA-\\u06FF-\\s]");

    /**
     * Creates a cleaner and loads the punctuation and stop-word lists from disk.
     *
     * @throws FileNotFoundException if one of the list files does not exist
     * @throws UnsupportedEncodingException if the UTF-8 charset name is not supported
     * @throws IOException if reading a list file fails
     */
    public Cleaner() throws FileNotFoundException, UnsupportedEncodingException, IOException {
        punctuations = initialize(punctuationPath);
        stopword = initialize(stopwordPath);
    }

    /**
     * Reads a UTF-8 text file into a list, one entry per non-empty line.
     *
     * <p>Empty lines are skipped: an empty punctuation entry would match every
     * word and make {@code String.replace("", " ")} insert a space between all
     * characters, and an empty stop word can never equal a token anyway.
     *
     * @param path path of the list file
     * @return the entries in file order
     * @throws IOException if the file cannot be opened or read
     */
    private List<String> initialize(String path) throws FileNotFoundException, UnsupportedEncodingException, IOException {
        List<String> entries = new ArrayList<>();
        // try-with-resources guarantees the file handle is released even when reading fails.
        try (FileInputStream stream = new FileInputStream(new File(path));
             BufferedReader in = new BufferedReader(new InputStreamReader(stream, "UTF8"))) {
            String line;
            while ((line = in.readLine()) != null) {
                if (!line.isEmpty()) {
                    entries.add(line);
                }
            }
        }
        return entries;
    }

    /**
     * Removes every whitespace-separated token of {@code text} that occurs in {@code list}
     * (compared case-insensitively).
     *
     * @return the remaining tokens joined by single spaces, without leading or trailing space
     */
    private String removeUnwantedTokens(String text, List<String> list) {
        StringTokenizer tokenizer = new StringTokenizer(text);
        StringBuilder kept = new StringBuilder();
        while (tokenizer.hasMoreTokens()) {
            String token = tokenizer.nextToken();
            if (!contains(list, token)) {
                if (kept.length() > 0) {
                    kept.append(' ');
                }
                kept.append(token);
            }
        }
        return kept.toString();
    }

    /** Returns whether {@code list} holds an entry equal to {@code word}, ignoring case. */
    private boolean contains(List<String> list, String word) {
        for (String entry : list) {
            if (entry.equalsIgnoreCase(word)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Strips punctuation from every whitespace-separated token of {@code text}.
     * Tokens that consist of punctuation only disappear.
     *
     * @return the stripped tokens joined by single spaces
     */
    private String removePunctuation(String text) {
        StringTokenizer tokenizer = new StringTokenizer(text);
        StringBuilder stripped = new StringBuilder();
        while (tokenizer.hasMoreTokens()) {
            String token = containsPunctuation(tokenizer.nextToken());
            if (!token.isEmpty()) {
                if (stripped.length() > 0) {
                    stripped.append(' ');
                }
                stripped.append(token);
            }
        }
        return stripped.toString();
    }

    /**
     * Replaces every occurrence of a known punctuation mark in {@code word} with a space
     * and trims the result. Despite its name this method does not return a boolean.
     *
     * @param word the token to strip; must not be {@code null}
     * @return the token with punctuation replaced by spaces, trimmed; may contain inner
     *         spaces when a mark sat between two letters
     */
    public String containsPunctuation(String word) {
        for (String mark : punctuations) {
            word = word.replace(mark, " ");
        }
        return word.trim();
    }

    /**
     * Splits {@code text} on runs of whitespace.
     *
     * @param text the text to split; must not be {@code null}
     * @return the parts, as produced by {@code String.split("\\s+")}
     */
    public String[] splitAtSpaces(String text) {
        return text.split("\\s+");
    }

    /**
     * Cleans {@code text}: line breaks become spaces, disallowed characters are deleted,
     * tokens without a right-to-left character are dropped, punctuation is replaced by
     * spaces and stop words are removed.
     *
     * @param text the raw text; must not be {@code null}
     * @return the surviving words separated by single spaces; empty if nothing survives
     */
    public String clean(String text) {
        String flattened = text.trim().replace("\r\n", " ").replace("\n", " ");
        String filtered = DISALLOWED_CHARACTERS.matcher(flattened).replaceAll("");

        StringBuilder rtlWords = new StringBuilder();
        StringTokenizer tokenizer = new StringTokenizer(filtered);
        while (tokenizer.hasMoreTokens()) {
            String word = tokenizer.nextToken();
            // Hyphen-only tokens survive the character filter; this check removes them.
            if (RTL_CHARACTERS.matcher(word).find()) {
                rtlWords.append(' ').append(word.trim());
            }
        }

        String finalText = removePunctuation(rtlWords.toString());
        return removeUnwantedTokens(finalText, stopword);
    }

    /** Small manual demo: cleans a sample news paragraph and prints its first token. */
    public static void main(String[] args) throws Exception {

        Cleaner cleaner = new Cleaner();
        String cleanText = cleaner.clean("به گزارش خبرگزاری فارس، پندار خمارلو در گفتگو با سایت رسمی باشگاه پرسپولیس اظهار داشت: برای برگزاری اولین دربی برون مرزی ، مسایل گوناگونی مورد توجه و بررسی طرف\u200Cهای دست اندر کار بود که در نهایت با جمع بندی های صورت گرفته ، برگزاری این بازی منتفی شد.\n" +
                "\n" +
                "وی ادامه داد: براین اساس در بازگشت تیم از اردوی اوکراین که صبح فردا خواهد بود، جلسه\u200Cای بین آقای طاهری و برانکو ایوانکوویچ برگزار و برنامه های تیم برای بازی در سوپر جام ایران بررسی خواهد شد و تیم فوتبال پرسپولیس خود را آماده حضور پر قدرت در این بازی خواهد کرد.");

        String[] tokens = cleaner.splitAtSpaces(cleanText);

        System.out.println(tokens[0]);
    }

}
Loading