Skip to content

Commit

Permalink
adapt segmentation
Browse files Browse the repository at this point in the history
  • Loading branch information
kermitt2 committed Sep 29, 2024
1 parent c664d63 commit 8e09ba4
Show file tree
Hide file tree
Showing 6 changed files with 533 additions and 70 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ public class BasicStructureBuilder {
* @param b integer
* @param doc a document
*/
private static void addBlockToCluster(Integer b, Document doc) {
/*private static void addBlockToCluster(Integer b, Document doc) {
// get block features
Block block = doc.getBlocks().get(b);
String font = block.getFont();
Expand Down Expand Up @@ -100,7 +100,7 @@ private static void addBlockToCluster(Integer b, Document doc) {
cluster.addBlock2(b);
doc.getClusters().add(cluster);
}
}
}*/

static public Document generalResultSegmentation(Document doc, String labeledResult, List<LayoutToken> documentTokens) {
List<Pair<String, String>> labeledTokens = GenericTaggerUtils.getTokensAndLabels(labeledResult);
Expand Down
37 changes: 20 additions & 17 deletions grobid-core/src/main/java/org/grobid/core/document/Document.java
Original file line number Diff line number Diff line change
Expand Up @@ -86,16 +86,16 @@ public class Document implements Serializable {
public static final int MAX_FIG_BOX_DISTANCE = 70;
protected transient final DocumentSource documentSource;

protected String pathXML = null; // XML representation of the current PDF file
protected String pathXML = null; // base ALTO XML representation of the current PDF file

protected String lang = null;

// layout structure of the document
protected transient List<Page> pages = null;
protected transient List<Cluster> clusters = null;
//protected transient List<Cluster> clusters = null;
protected transient List<Block> blocks = null;

protected List<Integer> blockDocumentHeaders = null;
//protected List<Integer> blockDocumentHeaders = null;

protected transient FeatureFactory featureFactory = null;

Expand All @@ -118,10 +118,6 @@ public class Document implements Serializable {

protected transient ReferenceMarkerMatcher referenceMarkerMatcher;

public void setImages(List<GraphicObject> images) {
this.images = images;
}

// list of bitmaps and vector graphics of the document
protected transient List<GraphicObject> images = null;

Expand All @@ -131,6 +127,7 @@ public void setImages(List<GraphicObject> images) {
// the document outline (or bookmark) embedded in the PDF, if present
protected transient DocumentNode outlineRoot = null;

// the metadata embedded in the PDF, if present
protected transient Metadata metadata = null;

protected transient Multimap<Integer, GraphicObject> imagesPerPage = LinkedListMultimap.create();
Expand All @@ -144,13 +141,15 @@ public void setImages(List<GraphicObject> images) {
protected int documentLenghtChar = -1; // length here is expressed as number of characters

// not used
protected int beginBody = -1;
protected int beginReferences = -1;
//protected int beginBody = -1;
//protected int beginReferences = -1;

protected boolean titleMatchNum = false; // true if the section titles of the document are numbered

protected transient List<Figure> figures;
protected transient Predicate<GraphicObject> validGraphicObjectPredicate;

// general parameter indicating bounding box margin when considering area
protected int m;

protected transient List<Table> tables;
Expand Down Expand Up @@ -188,6 +187,10 @@ public static Document createFromText(String text) {
return doc;
}

/**
 * Replaces the list of bitmaps and vector graphics attached to this document.
 *
 * @param images the graphic objects to attach; the list is stored by reference, not copied
 */
public void setImages(final List<GraphicObject> images) {
    this.images = images;
}

/**
 * Records the language of this document.
 *
 * @param l the language value to store in the {@code lang} field
 */
public void setLanguage(final String l) {
    this.lang = l;
}
Expand Down Expand Up @@ -723,9 +726,9 @@ public void setTei(String tei) {
this.tei = tei;
}

public List<Integer> getBlockDocumentHeaders() {
/*public List<Integer> getBlockDocumentHeaders() {
return blockDocumentHeaders;
}
}*/

public DocumentNode getOutlineRoot() {
return outlineRoot;
Expand All @@ -752,17 +755,17 @@ public Page getPage(int num) {
return pages.get(num - 1);
}

public List<Cluster> getClusters() {
/*public List<Cluster> getClusters() {
return clusters;
}
}*/

public void setBlockDocumentHeaders(List<Integer> blockDocumentHeaders) {
/*public void setBlockDocumentHeaders(List<Integer> blockDocumentHeaders) {
this.blockDocumentHeaders = blockDocumentHeaders;
}
}*/

public void setClusters(List<Cluster> clusters) {
/*public void setClusters(List<Cluster> clusters) {
this.clusters = clusters;
}
}*/

public void setPages(List<Page> pages) {
this.pages = pages;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package org.grobid.core.engines;

import org.apache.commons.lang3.tuple.Pair;
import eugfc.imageio.plugins.PNMRegistry;
import org.apache.commons.io.FileUtils;

import org.grobid.core.GrobidModels;
import org.grobid.core.engines.tagging.GenericTagger;
Expand All @@ -9,6 +11,7 @@
import org.grobid.core.document.DocumentPiece;
import org.grobid.core.document.DocumentPointer;
import org.grobid.core.document.BasicStructureBuilder;
import org.grobid.core.document.DocumentSource;
import org.grobid.core.engines.label.SegmentationLabels;
import org.grobid.core.engines.label.TaggingLabels;
import org.grobid.core.engines.label.TaggingLabel;
Expand All @@ -31,13 +34,17 @@
import org.grobid.core.utilities.TextUtilities;
import org.grobid.core.utilities.Triple;
import org.grobid.core.utilities.LayoutTokensUtil;
import org.grobid.core.utilities.GrobidProperties;
import org.grobid.core.engines.config.GrobidAnalysisConfig;
import org.grobid.core.exceptions.*;

import com.google.common.collect.Multimap;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.File;

import java.util.ArrayList;
import java.util.Collections;
Expand All @@ -51,6 +58,9 @@
import java.util.HashMap;
import java.util.Comparator;

import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;

/**
* A model for segmenting the figure areas. The model is applied after the Segmentation model and
* works as follows:
Expand Down Expand Up @@ -89,14 +99,27 @@ public FigureSegmenterParser() {
figureSegmenterParserDown = null;//TaggerFactory.getTagger(GrobidModels.FIGURE_SEGMENTER_DOWN);
}

public Document extract(Document doc) {
public Document extract(DocumentSource documentSource, GrobidAnalysisConfig config) {

Document doc = new Document(documentSource);
if (config.getAnalyzer() != null)
doc.setAnalyzer(config.getAnalyzer());
doc.addTokenizedDocument(config);
doc = prepareDocument(doc);

File assetFile = config.getPdfAssetPath();
if (assetFile != null) {
dealWithImages(documentSource, doc, assetFile, config);
}

// figure anchors are based on VectorGraphicBoxCalculator, which aggregate bitmap and SVG elements
List<GraphicObject> figureAnchors = this.initFigureAnchors(doc);

// for each figure anchor, we generate sequence to be labeled with features
Pair<List<String>,List<LayoutTokenization>> featureObjectUp = this.getAreasFeatured(doc, figureAnchors, Direction.UP);
Pair<List<String>,List<LayoutTokenization>> featureObjectDown = this.getAreasFeatured(doc, figureAnchors, Direction.DOWN);
Pair<List<String>,List<LayoutTokenization>> featureObjectUp =
this.getAreasFeatured(doc, figureAnchors, Direction.UP);
Pair<List<String>,List<LayoutTokenization>> featureObjectDown =
this.getAreasFeatured(doc, figureAnchors, Direction.DOWN);

List<String> contentsUp = featureObjectUp.getLeft();
List<String> contentsDown = featureObjectDown.getLeft();
Expand Down Expand Up @@ -132,6 +155,106 @@ public Document extract(Document doc) {
return doc;
}

/**
 * Checks the tokenized document against the configured token limit and, when it is within
 * the limit, computes the document statistics.
 *
 * @param doc the document whose tokenizations have already been produced
 * @return the same {@code doc} instance, after {@code produceStatistics()} has been called
 * @throws GrobidException with status {@code TOO_MANY_TOKENS} when the number of tokens
 *         is greater than {@code GrobidProperties.getPdfTokensMax()}
 */
public Document prepareDocument(Document doc) {
    final int tokenCount = doc.getTokenizations().size();

    // Happy path first: the document is small enough to be processed.
    if (tokenCount <= GrobidProperties.getPdfTokensMax()) {
        doc.produceStatistics();
        return doc;
    }

    throw new GrobidException(
        "The document has " + tokenCount + " tokens, but the limit is " + GrobidProperties.getPdfTokensMax(),
        GrobidExceptionStatus.TOO_MANY_TOKENS);
}

/**
 * Transfers the image assets found next to the ALTO XML file (directory
 * {@code <xml path>_data}) into {@code assetFile}, then, when image preprocessing is enabled,
 * rewrites the file paths of the document's graphic objects so that they point into the
 * asset directory.
 *
 * <p>File handling, per file of the {@code _data} directory:
 * <ul>
 *   <li>{@code .png} files, or any non-{@code .svg} file when preprocessing is disabled,
 *       are copied as is;</li>
 *   <li>{@code .jpg} and {@code .ppm} files (preprocessing enabled) are re-encoded as PNG
 *       under their lower-cased name;</li>
 *   <li>everything else is ignored.</li>
 * </ul>
 * Processing stops once the number of transferred files exceeds
 * {@code DocumentSource.PDFALTO_FILES_AMOUNT_LIMIT}. I/O failures are logged and the file
 * is skipped; they never abort the whole transfer.
 *
 * @param documentSource source giving access to the ALTO XML file location
 * @param doc            document whose image descriptions are updated
 * @param assetFile      target asset directory; nothing is done when {@code null}
 * @param config         analysis configuration, read for {@code isPreprocessImages()}
 */
private void dealWithImages(DocumentSource documentSource, Document doc, File assetFile, GrobidAnalysisConfig config) {
    if (assetFile == null) {
        return;
    }

    // copy the files under the directory pathXML+"_data" (the asset files) into the path specified by assetPath
    if (!assetFile.exists()) {
        // we create it
        if (assetFile.mkdir()) {
            LOGGER.debug("Directory created: {}", assetFile.getPath());
        } else {
            LOGGER.error("Failed to create directory: {}", assetFile.getPath());
        }
    }
    PNMRegistry.registerAllServicesProviders();

    // loop-invariant: read the flag once instead of once per file
    final boolean preprocessImages = config.isPreprocessImages();

    final File dataDirectory = new File(documentSource.getXmlFile().getAbsolutePath() + "_data");
    final File[] files = dataDirectory.exists() ? dataDirectory.listFiles() : null;
    if (files != null) {
        int nbFiles = 0;
        for (final File currFile : files) {
            if (nbFiles > DocumentSource.PDFALTO_FILES_AMOUNT_LIMIT) {
                break;
            }

            final String lowerCaseName = currFile.getName().toLowerCase();
            if (lowerCaseName.endsWith(".png") || !preprocessImages) {
                // vector graphics are never copied as assets
                if (lowerCaseName.endsWith(".svg")) {
                    continue;
                }
                try {
                    FileUtils.copyFileToDirectory(currFile, assetFile);
                    nbFiles++;
                } catch (IOException e) {
                    LOGGER.error("Cannot copy file {} to {}", currFile.getAbsolutePath(), assetFile.getAbsolutePath(), e);
                }
            } else if (lowerCaseName.endsWith(".jpg") || lowerCaseName.endsWith(".ppm")) {
                final String pngName = lowerCaseName.endsWith(".jpg")
                    ? lowerCaseName.replace(".jpg", ".png")
                    : lowerCaseName.replace(".ppm", ".png");
                final String outputFilePath = assetFile.getPath() + File.separator + pngName;
                try {
                    final BufferedImage bi = ImageIO.read(currFile);
                    if (bi == null) {
                        // ImageIO.read returns null when no registered reader can decode the
                        // file; passing null to ImageIO.write would throw an unchecked
                        // IllegalArgumentException and abort the whole document.
                        LOGGER.error("Cannot convert file {} to {}: no image reader could decode it",
                            currFile.getAbsolutePath(), outputFilePath);
                        continue;
                    }
                    ImageIO.write(bi, "png", new File(outputFilePath));
                    nbFiles++;
                } catch (IOException e) {
                    LOGGER.error("Cannot convert file {} to {}", currFile.getAbsolutePath(), outputFilePath, e);
                }
            }
        }
    }

    // update the path of the image description stored in Document
    if (!preprocessImages) {
        return;
    }
    final List<GraphicObject> images = doc.getImages();
    if (images == null) {
        return;
    }

    // keep only the last path segment of the asset directory
    String subPath = assetFile.getPath();
    final int lastSlash = subPath.lastIndexOf("/");
    if (lastSlash != -1) {
        subPath = subPath.substring(lastSlash + 1);
    }

    for (GraphicObject image : images) {
        String fileImage = image.getFilePath();
        if (fileImage == null) {
            continue;
        }
        fileImage = fileImage.replace(".ppm", ".png")
            .replace(".jpg", ".png");

        // Replace the leading directory of the image path by the asset directory name.
        // A path without any "/" has no directory part to strip: previously this case
        // raised a StringIndexOutOfBoundsException (substring(-1, ...)).
        final int firstSlash = fileImage.indexOf("/");
        final String relativePart = firstSlash != -1 ? fileImage.substring(firstSlash) : "/" + fileImage;
        image.setFilePath(subPath + relativePart);
    }
}

private List<GraphicObject> initFigureAnchors(Document doc) {
// update images list
List<GraphicObject> figureAnchors = new ArrayList<>();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -154,9 +154,6 @@ public Document processing(DocumentSource documentSource,
Document doc = parsers.getFigureSegmenterParser().extract(documentSource, config);
doc = parsers.getSegmentationParser().processing(doc, config);




SortedSet<DocumentPiece> documentBodyParts = doc.getDocumentPart(SegmentationLabels.BODY);

// header processing
Expand Down
Loading

0 comments on commit 8e09ba4

Please sign in to comment.