public class HeaderParser extends AbstractParser
analyzer, cntManager
Constructor and Description |
---|
HeaderParser(EngineParsers parsers) |
HeaderParser(EngineParsers parsers,
CntManager cntManager) |
Modifier and Type | Method and Description |
---|---|
void |
close() |
BiblioItem |
consolidateHeader(BiblioItem resHeader,
int consolidate)
Consolidate an existing recognized header (bibliographic item) based on access to
external internet bibliographic databases.
|
Document |
createTrainingHeader(java.lang.String inputFile,
java.lang.String pathHeader,
java.lang.String pathTEI)
Process the header of the specified pdf and format the result as training
data.
|
<any> |
getSectionHeaderFeatured(Document doc,
java.util.SortedSet<DocumentPiece> documentHeaderParts,
boolean withRotation)
Return the header section with features to be processed by the CRF model
|
<any> |
processing(java.io.File input,
BiblioItem resHeader,
GrobidAnalysisConfig config)
Processing with application of the segmentation model
|
<any> |
processing2(java.lang.String pdfInput,
BiblioItem resHeader,
GrobidAnalysisConfig config)
Processing without application of the segmentation model; regular expressions are used to
identify the header zone.
|
java.lang.String |
processingHeaderBlock(GrobidAnalysisConfig config,
Document doc,
BiblioItem resHeader)
Header processing after identification of the header blocks with heuristics (old approach)
|
java.lang.String |
processingHeaderSection(GrobidAnalysisConfig config,
Document doc,
BiblioItem resHeader)
Header processing after application of the segmentation model (new approach)
|
BiblioItem |
resultExtraction(java.lang.String result,
boolean intro,
java.util.List<LayoutToken> tokenizations,
BiblioItem biblio,
Document doc)
Extract results from a labelled header.
|
java.lang.StringBuilder |
trainingExtraction(java.lang.String result,
boolean intro,
java.util.List<LayoutToken> tokenizations)
Extract results from a labelled header in the training format without any
string modification.
|
label, label
public HeaderParser(EngineParsers parsers, CntManager cntManager)
public HeaderParser(EngineParsers parsers)
public <any> processing(java.io.File input, BiblioItem resHeader, GrobidAnalysisConfig config)
public <any> processing2(java.lang.String pdfInput, BiblioItem resHeader, GrobidAnalysisConfig config)
public java.lang.String processingHeaderBlock(GrobidAnalysisConfig config, Document doc, BiblioItem resHeader) throws java.lang.Exception
java.lang.Exception
public java.lang.String processingHeaderSection(GrobidAnalysisConfig config, Document doc, BiblioItem resHeader)
public <any> getSectionHeaderFeatured(Document doc, java.util.SortedSet<DocumentPiece> documentHeaderParts, boolean withRotation)
public Document createTrainingHeader(java.lang.String inputFile, java.lang.String pathHeader, java.lang.String pathTEI)
inputFile - path to input file
pathHeader - path to header
pathTEI - path to TEI
public BiblioItem resultExtraction(java.lang.String result, boolean intro, java.util.List<LayoutToken> tokenizations, BiblioItem biblio, Document doc)
result - result
intro - if intro
tokenizations - list of tokens
biblio - biblio item
public java.lang.StringBuilder trainingExtraction(java.lang.String result, boolean intro, java.util.List<LayoutToken> tokenizations)
result - result
intro - if intro
tokenizations - list of tokens
public BiblioItem consolidateHeader(BiblioItem resHeader, int consolidate)
resHeader - original biblio item
public void close() throws java.io.IOException
close
in interface java.io.Closeable
close
in interface java.lang.AutoCloseable
close
in class AbstractParser
java.io.IOException