public class Document
extends java.lang.Object
implements java.io.Serializable
Modifier and Type | Field and Description |
---|---|
protected java.util.List<java.lang.Integer> |
acknowledgementBlocks |
protected Analyzer |
analyzer |
protected int |
beginBody |
protected int |
beginReferences |
protected java.util.List<BibDataSet> |
bibDataSets |
protected java.util.List<java.lang.Integer> |
blockDocumentHeaders |
protected java.util.List<java.lang.Integer> |
blockFigures |
protected java.util.List<java.lang.Integer> |
blockFooters |
protected java.util.List<java.lang.Integer> |
blockHeaders |
protected java.util.List<java.lang.Integer> |
blockHeadFigures |
protected java.util.List<java.lang.Integer> |
blockHeadTables |
protected java.util.SortedSet<DocumentPiece> |
blockReferences |
protected java.util.List<Block> |
blocks |
protected java.util.List<java.lang.Integer> |
blockSectionTitles |
protected java.util.List<java.lang.Integer> |
blockTables |
protected java.util.List<Cluster> |
clusters |
protected int |
documentLenghtChar |
protected DocumentSource |
documentSource |
protected java.util.List<Equation> |
equations |
protected FeatureFactory |
featureFactory |
protected java.util.List<Figure> |
figures |
protected java.util.List<GraphicObject> |
images |
protected <any> |
imagesPerPage |
protected <any> |
labeledBlocks |
protected java.lang.String |
lang |
protected static Logger |
LOGGER |
protected int |
m |
static int |
MAX_FIG_BOX_DISTANCE |
protected double |
maxBlockSpacing |
protected double |
maxCharacterDensity |
protected Metadata |
metadata |
protected static double |
MIN_DISTANCE |
protected double |
minBlockSpacing |
protected double |
minCharacterDensity |
protected static int |
nbBins |
protected DocumentNode |
outlineRoot |
protected java.util.List<Page> |
pages |
protected java.lang.String |
pathXML |
protected java.util.List<PDFAnnotation> |
pdfAnnotations |
protected ReferenceMarkerMatcher |
referenceMarkerMatcher |
protected BiblioItem |
resHeader |
static long |
serialVersionUID |
protected java.util.List<Table> |
tables |
protected java.lang.String |
tei |
protected java.util.Map<java.lang.String,BibDataSet> |
teiIdToBibDataSets |
protected boolean |
titleMatchNum |
protected java.util.List<LayoutToken> |
tokenizations |
protected <any> |
validGraphicObjectPredicate |
Modifier | Constructor and Description |
---|---|
protected |
Document() |
|
Document(DocumentSource documentSource) |
Modifier and Type | Method and Description |
---|---|
void |
addBlock(Block b) |
void |
addPage(Page page) |
java.util.List<LayoutToken> |
addTokenizedDocument(GrobidAnalysisConfig config)
Parse the PDFALTO output representation and get the tokenized form of the document.
|
void |
assignGraphicObjectsToFigures() |
void |
calculateTeiIdToBibDataSets() |
static Document |
createFromText(java.lang.String text) |
java.util.List<LayoutToken> |
fromText(java.lang.String text) |
java.lang.String |
getAllBlocksClean(int toIgnore1,
int toIgnore2)
Return all blocks without markers.
|
Analyzer |
getAnalyzer() |
BibDataSet |
getBibDataSetByTeiId(java.lang.String teiId) |
java.util.List<BibDataSet> |
getBibDataSets() |
java.util.List<java.lang.Integer> |
getBlockDocumentHeaders() |
java.util.List<Block> |
getBlocks() |
java.lang.String |
getBody()
Return the text content of the body of the document.
|
java.util.List<Cluster> |
getClusters() |
static java.util.List<GraphicObject> |
getConnectedGraphics(Block block,
Document doc)
Return the list of graphical objects touching the given block.
|
protected static int |
getCoordItem(ElementCounter<java.lang.Integer> cnt,
boolean getMin) |
int |
getDocumentLenghtChar() |
java.util.SortedSet<DocumentPiece> |
getDocumentPart(TaggingLabel segmentationLabel)
Get the document part corresponding to a particular segment type.
|
java.lang.String |
getDocumentPartText(TaggingLabel segmentationLabel) |
java.lang.String |
getDocumentPieceText(DocumentPiece dp) |
java.lang.String |
getDocumentPieceText(java.util.SortedSet<DocumentPiece> dps) |
java.util.List<LayoutToken> |
getDocumentPieceTokenization(DocumentPiece dp) |
DocumentSource |
getDocumentSource() |
java.util.List<java.lang.String> |
getDOIMatches() |
java.util.List<Equation> |
getEquations() |
protected java.util.List<LayoutToken> |
getFigureLayoutTokens(Figure f) |
java.util.List<Figure> |
getFigures() |
java.lang.String |
getHeader()
Heuristics to get the header section.
|
java.lang.String |
getHeaderByIntroduction()
We try to match the introduction section in a safe way and, if minimum
requirements are met, consider the blocks before this position as the header.
|
java.lang.String |
getHeaderFeatured(boolean getHeader,
boolean withRotation)
Add features in the header section.
Note: this should be moved to the header parser class.
|
java.lang.String |
getHeaderLastHope()
We return the first page as the header estimate.
|
java.util.List<GraphicObject> |
getImages() |
<any> |
getLabeledBlocks() |
java.lang.String |
getLanguage() |
double |
getMaxBlockSpacing() |
double |
getMaxCharacterDensity() |
Metadata |
getMetadata() |
double |
getMinBlockSpacing() |
double |
getMinCharacterDensity() |
DocumentNode |
getOutlineRoot() |
Page |
getPage(int num) |
java.util.List<Page> |
getPages() |
java.util.List<PDFAnnotation> |
getPDFAnnotations() |
ReferenceMarkerMatcher |
getReferenceMarkerMatcher() |
BiblioItem |
getResHeader() |
java.util.List<Table> |
getTables() |
java.lang.String |
getTei() |
static java.util.List<LayoutToken> |
getTokenizationParts(java.util.SortedSet<DocumentPiece> documentParts,
java.util.List<LayoutToken> tokenizations)
Give the list of LayoutToken corresponding to some document parts and
a global document tokenization.
|
java.util.List<LayoutToken> |
getTokenizations() |
java.util.List<LayoutToken> |
getTokenizationsFulltext()
Deprecated.
|
java.util.List<LayoutToken> |
getTokenizationsHeader()
Deprecated.
|
java.util.List<LayoutToken> |
getTokenizationsReferences()
Deprecated.
|
static java.util.List<LayoutToken> |
getTokens(java.util.List<LayoutToken> tokenizations,
int offsetBegin,
int offsetEnd) |
static java.util.List<LayoutToken> |
getTokensFrom(java.util.List<LayoutToken> tokenizations,
int offsetBegin,
int offsetEnd,
int startTokenIndex) |
protected java.util.ArrayList<GraphicObject> |
glueImagesIfNecessary(java.lang.Integer pageNum,
java.util.List<GraphicObject> graphicObjects) |
boolean |
isTitleMatchNum() |
protected boolean |
isValidBitmapGraphicObject(GraphicObject go) |
void |
postProcessTables() |
void |
produceStatistics() |
protected void |
recalculateVectorBoxCoords(Figure f,
GraphicObject g) |
void |
setAcknowledgementBlocks(java.util.List<java.lang.Integer> acknowledgementBlocks) |
void |
setAnalyzer(Analyzer analyzer) |
void |
setBibDataSets(java.util.List<BibDataSet> bibDataSets) |
void |
setBlockDocumentHeaders(java.util.List<java.lang.Integer> blockDocumentHeaders) |
void |
setBlockFigures(java.util.List<java.lang.Integer> blockFigures) |
void |
setBlockFooters(java.util.List<java.lang.Integer> blockFooters) |
void |
setBlockHeaders(java.util.List<java.lang.Integer> blockHeaders) |
void |
setBlockHeadFigures(java.util.List<java.lang.Integer> blockHeadFigures) |
void |
setBlockHeadTables(java.util.List<java.lang.Integer> blockHeadTables) |
void |
setBlockReferences(java.util.SortedSet<DocumentPiece> blockReferences) |
void |
setBlockSectionTitles(java.util.List<java.lang.Integer> blockSectionTitles) |
void |
setBlockTables(java.util.List<java.lang.Integer> blockTables) |
void |
setClusters(java.util.List<Cluster> clusters) |
void |
setConnectedGraphics2(Figure figure) |
void |
setEquations(java.util.List<Equation> equations) |
void |
setFigures(java.util.List<Figure> figures) |
void |
setImages(java.util.List<GraphicObject> images) |
void |
setLabeledBlocks(<any> labeledBlocks) |
void |
setLanguage(java.lang.String l) |
void |
setOutlineRoot(DocumentNode outlineRoot) |
void |
setPages(java.util.List<Page> pages) |
protected void |
setPathXML(java.io.File pathXML)
Set the path to the XML file generated by xml2pdf
|
void |
setResHeader(BiblioItem resHeader) |
void |
setTables(java.util.List<Table> tables) |
void |
setTei(java.lang.String tei) |
void |
setTitleMatchNum(boolean titleMatchNum) |
public static final long serialVersionUID
protected static final Logger LOGGER
public static final int MAX_FIG_BOX_DISTANCE
protected final transient DocumentSource documentSource
protected java.lang.String pathXML
protected java.lang.String lang
protected transient java.util.List<Page> pages
protected transient java.util.List<Cluster> clusters
protected transient java.util.List<Block> blocks
protected java.util.List<java.lang.Integer> blockHeaders
protected java.util.List<java.lang.Integer> blockFooters
protected java.util.List<java.lang.Integer> blockSectionTitles
protected java.util.List<java.lang.Integer> acknowledgementBlocks
protected java.util.List<java.lang.Integer> blockDocumentHeaders
protected transient java.util.SortedSet<DocumentPiece> blockReferences
protected java.util.List<java.lang.Integer> blockTables
protected java.util.List<java.lang.Integer> blockFigures
protected java.util.List<java.lang.Integer> blockHeadTables
protected java.util.List<java.lang.Integer> blockHeadFigures
protected transient FeatureFactory featureFactory
protected transient <any> labeledBlocks
protected java.util.List<LayoutToken> tokenizations
protected transient java.util.Map<java.lang.String,BibDataSet> teiIdToBibDataSets
protected transient java.util.List<BibDataSet> bibDataSets
protected transient BiblioItem resHeader
protected java.lang.String tei
protected transient ReferenceMarkerMatcher referenceMarkerMatcher
protected transient java.util.List<GraphicObject> images
protected transient java.util.List<PDFAnnotation> pdfAnnotations
protected transient DocumentNode outlineRoot
protected transient Metadata metadata
protected transient <any> imagesPerPage
protected double maxCharacterDensity
protected double minCharacterDensity
protected double maxBlockSpacing
protected double minBlockSpacing
protected int documentLenghtChar
protected int beginBody
protected int beginReferences
protected boolean titleMatchNum
protected transient java.util.List<Figure> figures
protected transient <any> validGraphicObjectPredicate
protected int m
protected transient java.util.List<Table> tables
protected transient java.util.List<Equation> equations
protected transient Analyzer analyzer
protected static final int nbBins
protected static double MIN_DISTANCE
public Document(DocumentSource documentSource)
protected Document()
public void setImages(java.util.List<GraphicObject> images)
public static Document createFromText(java.lang.String text)
public void setLanguage(java.lang.String l)
public java.lang.String getLanguage()
public BiblioItem getResHeader()
public java.util.List<Block> getBlocks()
public java.util.List<BibDataSet> getBibDataSets()
public void addBlock(Block b)
public java.util.List<GraphicObject> getImages()
public java.util.List<PDFAnnotation> getPDFAnnotations()
public Metadata getMetadata()
protected void setPathXML(java.io.File pathXML)
public java.util.List<LayoutToken> getTokenizations()
public int getDocumentLenghtChar()
public double getMaxCharacterDensity()
public double getMinCharacterDensity()
public double getMaxBlockSpacing()
public double getMinBlockSpacing()
public void setAnalyzer(Analyzer analyzer)
public Analyzer getAnalyzer()
@Deprecated public java.util.List<LayoutToken> getTokenizationsHeader()
@Deprecated public java.util.List<LayoutToken> getTokenizationsFulltext()
@Deprecated public java.util.List<LayoutToken> getTokenizationsReferences()
public java.util.List<LayoutToken> fromText(java.lang.String text)
public java.util.List<LayoutToken> addTokenizedDocument(GrobidAnalysisConfig config)
protected java.util.ArrayList<GraphicObject> glueImagesIfNecessary(java.lang.Integer pageNum, java.util.List<GraphicObject> graphicObjects)
protected static int getCoordItem(ElementCounter<java.lang.Integer> cnt, boolean getMin)
public java.lang.String getHeaderFeatured(boolean getHeader, boolean withRotation)
public java.lang.String getHeader()
public java.lang.String getHeaderLastHope()
public java.lang.String getHeaderByIntroduction()
public java.lang.String getBody()
public java.lang.String getAllBlocksClean(int toIgnore1, int toIgnore2)
public java.util.List<java.lang.String> getDOIMatches()
public java.lang.String getTei()
public void setTei(java.lang.String tei)
public java.util.List<java.lang.Integer> getBlockDocumentHeaders()
public DocumentNode getOutlineRoot()
public void setOutlineRoot(DocumentNode outlineRoot)
public boolean isTitleMatchNum()
public void setTitleMatchNum(boolean titleMatchNum)
public java.util.List<Page> getPages()
public Page getPage(int num)
public java.util.List<Cluster> getClusters()
public void setBlockHeaders(java.util.List<java.lang.Integer> blockHeaders)
public void setBlockFooters(java.util.List<java.lang.Integer> blockFooters)
public void setBlockSectionTitles(java.util.List<java.lang.Integer> blockSectionTitles)
public void setAcknowledgementBlocks(java.util.List<java.lang.Integer> acknowledgementBlocks)
public void setBlockDocumentHeaders(java.util.List<java.lang.Integer> blockDocumentHeaders)
public void setBlockReferences(java.util.SortedSet<DocumentPiece> blockReferences)
public void setBlockTables(java.util.List<java.lang.Integer> blockTables)
public void setBlockFigures(java.util.List<java.lang.Integer> blockFigures)
public void setBlockHeadTables(java.util.List<java.lang.Integer> blockHeadTables)
public void setBlockHeadFigures(java.util.List<java.lang.Integer> blockHeadFigures)
public void setClusters(java.util.List<Cluster> clusters)
public void setPages(java.util.List<Page> pages)
public void addPage(Page page)
public void setBibDataSets(java.util.List<BibDataSet> bibDataSets)
public ReferenceMarkerMatcher getReferenceMarkerMatcher() throws EntityMatcherException
Throws: EntityMatcherException
public void calculateTeiIdToBibDataSets()
public <any> getLabeledBlocks()
public void setLabeledBlocks(<any> labeledBlocks)
public java.util.List<LayoutToken> getDocumentPieceTokenization(DocumentPiece dp)
public java.lang.String getDocumentPieceText(DocumentPiece dp)
public java.lang.String getDocumentPieceText(java.util.SortedSet<DocumentPiece> dps)
public java.util.SortedSet<DocumentPiece> getDocumentPart(TaggingLabel segmentationLabel)
public java.lang.String getDocumentPartText(TaggingLabel segmentationLabel)
public static java.util.List<LayoutToken> getTokenizationParts(java.util.SortedSet<DocumentPiece> documentParts, java.util.List<LayoutToken> tokenizations)
public BibDataSet getBibDataSetByTeiId(java.lang.String teiId)
public static java.util.List<GraphicObject> getConnectedGraphics(Block block, Document doc)
public void postProcessTables()
public void assignGraphicObjectsToFigures()
protected boolean isValidBitmapGraphicObject(GraphicObject go)
protected void recalculateVectorBoxCoords(Figure f, GraphicObject g)
protected java.util.List<LayoutToken> getFigureLayoutTokens(Figure f)
public void setConnectedGraphics2(Figure figure)
public void produceStatistics()
public DocumentSource getDocumentSource()
public void setFigures(java.util.List<Figure> figures)
public java.util.List<Figure> getFigures()
public void setTables(java.util.List<Table> tables)
public java.util.List<Table> getTables()
public void setEquations(java.util.List<Equation> equations)
public java.util.List<Equation> getEquations()
public void setResHeader(BiblioItem resHeader)
public static java.util.List<LayoutToken> getTokens(java.util.List<LayoutToken> tokenizations, int offsetBegin, int offsetEnd)
public static java.util.List<LayoutToken> getTokensFrom(java.util.List<LayoutToken> tokenizations, int offsetBegin, int offsetEnd, int startTokenIndex)