public class FullTextParser extends AbstractParser
analyzer, cntManager
Constructor and Description |
---|
FullTextParser(EngineParsers parsers)
TODO some documentation...
|
Modifier and Type | Method and Description |
---|---|
void |
close() |
Document |
createTraining(java.io.File inputFile,
java.lang.String pathFullText,
java.lang.String pathTEI,
int id)
Process the specified PDF and format the result as training data for all the models.
|
static <any> |
getBodyTextFeatured(Document doc,
java.util.SortedSet<DocumentPiece> documentBodyParts) |
static java.util.List<LayoutTokenization> |
getDocumentFullTextTokens(java.util.List<TaggingLabel> labels,
java.lang.String labeledResult,
java.util.List<LayoutToken> tokenizations) |
protected static java.lang.String |
postProcessLabeledAbstract(java.lang.String labeledAbstract) |
Document |
processing(DocumentSource documentSource,
GrobidAnalysisConfig config)
Machine-learning recognition of the complete full text structures.
|
Document |
processing(java.io.File inputPdf,
GrobidAnalysisConfig config) |
<any> |
processShort(java.util.List<LayoutToken> tokens,
Document doc) |
<any> |
processShortNew(java.util.List<LayoutToken> tokens,
Document doc)
Process a simple segment of layout tokens with the full text model.
|
static boolean |
writeField(java.lang.StringBuilder buffer,
java.lang.String s1,
java.lang.String lastTag0,
java.lang.String s2,
java.lang.String field,
java.lang.String outField,
boolean addSpace,
int nbIndent,
boolean generateIDs)
TODO some documentation...
|
static boolean |
writeFieldBeginEnd(java.lang.StringBuilder buffer,
java.lang.String s1,
java.lang.String lastTag0,
java.lang.String s2,
java.lang.String field,
java.lang.String outField,
boolean addSpace,
int nbIndent,
boolean generateIDs)
Writes fields for which the beginning and the end of the field matter, such as a paragraph or an item.
|
label, label
public FullTextParser(EngineParsers parsers)
public Document processing(java.io.File inputPdf, GrobidAnalysisConfig config) throws java.lang.Exception
java.lang.Exception
public Document processing(DocumentSource documentSource, GrobidAnalysisConfig config)
documentSource - input
config - config
public <any> processShortNew(java.util.List<LayoutToken> tokens, Document doc)
public <any> processShort(java.util.List<LayoutToken> tokens, Document doc)
protected static java.lang.String postProcessLabeledAbstract(java.lang.String labeledAbstract)
public static <any> getBodyTextFeatured(Document doc, java.util.SortedSet<DocumentPiece> documentBodyParts)
public Document createTraining(java.io.File inputFile, java.lang.String pathFullText, java.lang.String pathTEI, int id)
inputFile - input file
pathFullText - path to fulltext
pathTEI - path to TEI
id - id
public static boolean writeField(java.lang.StringBuilder buffer, java.lang.String s1, java.lang.String lastTag0, java.lang.String s2, java.lang.String field, java.lang.String outField, boolean addSpace, int nbIndent, boolean generateIDs)
buffer - buffer
s1 -
lastTag0 -
s2 -
field -
outField -
addSpace -
nbIndent -
public static boolean writeFieldBeginEnd(java.lang.StringBuilder buffer, java.lang.String s1, java.lang.String lastTag0, java.lang.String s2, java.lang.String field, java.lang.String outField, boolean addSpace, int nbIndent, boolean generateIDs)
buffer -
s1 -
lastTag0 -
s2 -
field -
outField -
addSpace -
nbIndent -
public static java.util.List<LayoutTokenization> getDocumentFullTextTokens(java.util.List<TaggingLabel> labels, java.lang.String labeledResult, java.util.List<LayoutToken> tokenizations)
public void close() throws java.io.IOException
close
in interface java.io.Closeable
close
in interface java.lang.AutoCloseable
close
in class AbstractParser
java.io.IOException