Skip to content

Commit

Permalink
Merge pull request #80 from Myyyvothrr/main
Browse files Browse the repository at this point in the history
core pnas
  • Loading branch information
Myyyvothrr authored Aug 7, 2024
2 parents f7ca2f9 + c5a8d71 commit d69a86c
Show file tree
Hide file tree
Showing 4 changed files with 1,009 additions and 79 deletions.
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<maven.compiler.source>17</maven.compiler.source>
<maven.compiler.target>17</maven.compiler.target>
<typesystem.version>3.0.1</typesystem.version>
<typesystem.version>3.0.3</typesystem.version>
<utilities.version>3.0.0</utilities.version>
<uima.version>3.5.0</uima.version>
<dkpro.core.version>2.4.0</dkpro.core.version>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -781,6 +781,7 @@ public static ConcurrentLinkedQueue<String> removeIfInTarget(ConcurrentLinkedQue
addFilesToConcurrentList(targetDir, targetEnding, targetFilePaths);
}
System.out.println("Found " + targetFilePaths.size() + " files in target location");
System.out.println("Source location has: " + paths.size());

List<String> cleanList = new ArrayList<>();
if (!targetFilePaths.isEmpty()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.zip.GZIPOutputStream;
import java.util.List;
import java.util.ArrayList;

import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription;
import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription;
Expand Down Expand Up @@ -59,7 +61,16 @@ public void testSpacy() throws Exception {
DUUIAsynchronousProcessor processor = new DUUIAsynchronousProcessor(
new DUUIFileReader(
sourceLocation.toString(),
"xmi.gz"
"html.gz.xmi.gz",
1,
-1,
false,
"",
false,
null,
-1,
targetLocation.toString(),
"html.gz.xmi.gz"
)
);

Expand Down Expand Up @@ -108,68 +119,94 @@ public void testSpacy() throws Exception {

/**
 * Converts Google search result pages referenced in CSV listing files into gzipped XMI documents.
 * <p>
 * For every data row whose URL field (index 7) contains a Google search path, the HTML file
 * addressed by user (index 9), session (index 4) and html name (index 10) is loaded through
 * {@code HTMLGoogleSERPLoader}, annotated with {@code DocumentMetaData} and serialized to
 * {@code html_xmi_google_serps/<user>/<session>/<html>.html.gz.xmi.gz}.
 * <p>
 * NOTE(review): this span is a rendered diff without +/- markers, so lines of the previous
 * single-listing version and of the new per-task version appear interleaved. Confirm the
 * actual statement order against the real file before relying on it.
 */
@Test
public void testReader2() throws ParserConfigurationException, IOException, UIMAException, SAXException {
// NOTE(review): the next lines look like the pre-change version that read one fixed listing ("Funkmast") — confirm.
Path listFile = Paths.get("/storage/projects/CORE/erhebungen/t0/db/tasks/assessment_urls_html_Funkmast.csv");
try (BufferedReader reader = Files.newBufferedReader(listFile, StandardCharsets.UTF_8)) {
long counter = 0;
boolean skipFirstLine = true;
String line;
while ((line = reader.readLine()) != null) {
try {
counter += 1;
if (counter % 50 == 0) {
System.out.println(counter);
// Task names; each one is inserted into the listing file name built inside the loop below.
List<String> tasks = new ArrayList<>();
tasks.add("Gruene-Sosse");
tasks.add("Hitzestift");
tasks.add("Tetra-Pak");
tasks.add("Medizin-Atmung");
tasks.add("Medizin-Kreislauf");
tasks.add("Medizin-Mittelohr");
tasks.add("Nudging-Aufgabe");
tasks.add("Piloten-Streik-Aufgabe");
tasks.add("Startup-Aufgabe");
tasks.add("Start-Up-Aufgabe");
tasks.add("Windpark-Aufgabe");

for (String task : tasks) {
// One listing file per task, read as UTF-8.
Path listFile = Paths.get("/storage/projects/CORE/erhebungen/t0/db/tasks/assessment_urls_html_" + task + "_v2.csv");
try (BufferedReader reader = Files.newBufferedReader(listFile, StandardCharsets.UTF_8)) {
// counter counts every line read; countNew / countExists count written and already-present outputs.
long counter = 0;
long countNew = 0;
long countExists = 0;
boolean skipFirstLine = true;
String line;
while ((line = reader.readLine()) != null) {
try {
counter += 1;
// Progress output every 50 lines.
if (counter % 50 == 0) {
System.out.println(counter);
}

// The first line is never processed (presumably a header row — confirm).
if (skipFirstLine) {
skipFirstLine = false;
continue;
}

line = line.trim();
// Tab-separated fields; the -1 limit keeps trailing empty fields.
String[] fields = line.split("\t", -1);

// Only rows whose URL contains a google.com or google.de search path are converted.
String url = fields[7];
if (!url.contains("google.com/search") && !url.contains("google.de/search")) {
continue;
}

String user = fields[9];
String session = fields[4];
String html = fields[10];

// The document id doubles as the relative path below both the input and the output root.
String title = html + ".html.gz";
String docId = user + "/" + session + "/" + title;
String collectionId = "file:/storage/projects/CORE/azure/core-edutec-fileshare/html/";
String docBaseUri = collectionId;
String docUri = docBaseUri + docId;

Path filename = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/html/" + docId);
Path output = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/html_xmi_google_serps/" + docId + ".xmi.gz");
// Existing outputs are counted and skipped before the HTML is loaded.
if (Files.exists(output)) {
countExists++;
continue;
}

JCas jCas = HTMLGoogleSERPLoader.load(filename, null);

// Attach document metadata to the CAS before serializing it.
DocumentMetaData dmd = new DocumentMetaData(jCas);
dmd.setDocumentTitle(title);
dmd.setDocumentId(docId);
dmd.setDocumentUri(docUri);
dmd.setCollectionId(collectionId);
dmd.setDocumentBaseUri(docBaseUri);
dmd.addToIndexes();

// Create the output's parent directories, then write the CAS as XMI (XML 1.1, UTF-8) through a gzip stream.
Files.createDirectories(output.getParent());
try(GZIPOutputStream outputStream = new GZIPOutputStream(Files.newOutputStream(output))) {
XMLSerializer xmlSerializer = new XMLSerializer(outputStream, true);
xmlSerializer.setOutputProperty(OutputKeys.VERSION, "1.1");
xmlSerializer.setOutputProperty(OutputKeys.ENCODING, StandardCharsets.UTF_8.toString());
XmiCasSerializer xmiCasSerializer = new XmiCasSerializer(null);
xmiCasSerializer.serialize(jCas.getCas(), xmlSerializer.getContentHandler());
}

countNew++;
}

// NOTE(review): from here to the serialize call the lines look like the removed pre-change body
// (comma split, google.de only, no exists check) — confirm against the real file.
if (skipFirstLine) {
skipFirstLine = false;
continue;
}

line = line.trim();
String[] fields = line.split(",", -1);

String url = fields[7];
//if (!url.contains("google.com/search") && !url.contains("google.de/search")) {
if (!url.contains("google.de/search")) {
continue;
}

String user = fields[9];
String session = fields[4];
String html = fields[10];

String title = html + ".html.gz";
String docId = user + "/" + session + "/" + title;
String collectionId = "file:/storage/projects/CORE/azure/core-edutec-fileshare/html/";
String docBaseUri = collectionId;
String docUri = docBaseUri + docId;

Path filename = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/html/" + docId);
JCas jCas = HTMLGoogleSERPLoader.load(filename, null);

DocumentMetaData dmd = new DocumentMetaData(jCas);
dmd.setDocumentTitle(title);
dmd.setDocumentId(docId);
dmd.setDocumentUri(docUri);
dmd.setCollectionId(collectionId);
dmd.setDocumentBaseUri(docBaseUri);
dmd.addToIndexes();

Path output = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/html_xmi_google_serps/" + docId + ".xmi.gz");
Files.createDirectories(output.getParent());
try(GZIPOutputStream outputStream = new GZIPOutputStream(Files.newOutputStream(output))) {
XMLSerializer xmlSerializer = new XMLSerializer(outputStream, true);
xmlSerializer.setOutputProperty(OutputKeys.VERSION, "1.1");
xmlSerializer.setOutputProperty(OutputKeys.ENCODING, StandardCharsets.UTF_8.toString());
XmiCasSerializer xmiCasSerializer = new XmiCasSerializer(null);
xmiCasSerializer.serialize(jCas.getCas(), xmlSerializer.getContentHandler());
// Exceptions are only printed, not rethrown (presumably this catch belongs to the per-line try — confirm).
catch (Exception e) {
e.printStackTrace();
}
}
// Summary output: lines read, files written, outputs that already existed.
System.out.println("Count " + counter);
System.out.println(" New: " + countNew);
System.out.println(" Exists: " + countExists);
}
// Exceptions are only printed, not rethrown (presumably this catch belongs to the per-listing try — confirm).
catch (Exception e) {
e.printStackTrace();
}
}
}
}
}

@Test
Expand Down
Loading

0 comments on commit d69a86c

Please sign in to comment.