public class Engine
extends java.lang.Object
implements java.io.Closeable
| Constructor and Description |
|---|
Engine(boolean loadModels)
Constructor for the Grobid engine instance.
|
| Modifier and Type | Method and Description |
|---|---|
void |
addAcceptedLanguages(java.lang.String lang)
Add a language to the list of accepted languages.
|
java.lang.String |
annotateAllCitationsInPDFPatent(java.lang.String pdfPath,
int consolidateCitations,
boolean includeRawCitations)
Extract and parse both patent and non patent references within a patent
in PDF format.
|
int |
batchCreateTraining(java.lang.String directoryPath,
java.lang.String resultPath,
int ind)
Process all the PDF in a given directory with a segmentation process and
produce the corresponding training data format files for manual
correction.
|
int |
batchCreateTrainingBlank(java.lang.String directoryPath,
java.lang.String resultPath,
int ind)
Process all the PDF in a given directory with a pdf extraction and
produce blank training data.
|
int |
batchCreateTrainingMonograph(java.lang.String directoryPath,
java.lang.String resultPath,
int ind)
Process all the PDF in a given directory with a monograph process and
produce the corresponding training data format files for manual
correction.
|
int |
batchCreateTrainingPatentcitations(java.lang.String directoryPath,
java.lang.String resultPath)
Process all the XML patent documents in a given directory with a patent
citation extraction and produce the corresponding training data format
files for manual correction.
|
void |
close() |
void |
createTraining(java.io.File inputFile,
java.lang.String pathRaw,
java.lang.String pathTEI,
int id)
Create training data for all models based on the application of
the current full text model on a new PDF
|
void |
createTrainingBlank(java.io.File inputFile,
java.lang.String pathRaw,
java.lang.String pathTEI,
int id)
Generate blank training data from provided directory of PDF documents.
|
void |
createTrainingMonograph(java.io.File inputFile,
java.lang.String pathRaw,
java.lang.String pathTEI,
int id)
Create training data for the monograph model based on the application of
the current monograph text model on a new PDF
|
void |
createTrainingPatentCitations(java.lang.String pathXML,
java.lang.String resultPath)
Process an XML patent document with a patent citation extraction and
produce the corresponding training data format files for manual
correction.
|
java.lang.String |
downloadPDF(java.lang.String url,
java.lang.String dirName,
java.lang.String name)
Download a PDF file.
|
java.util.List<ChemicalEntity> |
extractChemicalEntities(java.lang.String text)
Extract chemical names from text.
|
java.lang.String |
fullTextToTEI(java.io.File inputFile,
GrobidAnalysisConfig config)
//TODO: remove invalid JavaDoc once refactoring is done and tested (left for easier reference)
Parse and convert the current article into TEI, this method performs the
whole parsing and conversion process.
|
Document |
fullTextToTEIDoc(DocumentSource documentSource,
GrobidAnalysisConfig config) |
Document |
fullTextToTEIDoc(java.io.File inputFile,
GrobidAnalysisConfig config) |
java.lang.String |
getAbstract(Document doc)
Print the abstract content.
|
java.util.List<java.lang.String> |
getAcceptedLanguages()
Give the list of languages for which an extraction is allowed.
|
static CntManager |
getCntManager() |
static Engine |
getEngine(boolean preload) |
EngineParsers |
getParsers() |
static java.lang.String |
header2BibTeX(BiblioItem resHeader)
Get the BibTeX string corresponding to the recognized header text
|
static java.lang.String |
header2TEI(BiblioItem resHeader)
Get the TEI XML string corresponding to the recognized header text
|
java.lang.String |
printRefTitles(java.util.List<BibDataSet> resBib)
Return all the reference titles.
|
java.util.List<Affiliation> |
processAffiliation(java.lang.String addressBlock)
Parse a text block corresponding to an affiliation+address.
|
java.util.List<java.util.List<Affiliation>> |
processAffiliations(java.util.List<java.lang.String> addressBlocks)
Parse a list of text blocks corresponding to an affiliation+address.
|
java.lang.String |
processAllCitationsInPatent(java.lang.String text,
java.util.List<BibDataSet> nplResults,
java.util.List<PatentItem> patentResults,
int consolidateCitations,
boolean includeRawCitations)
Extract and parse both patent and non patent references within a patent text.
|
java.lang.String |
processAllCitationsInPDFPatent(java.lang.String pdfPath,
java.util.List<BibDataSet> nplResults,
java.util.List<PatentItem> patentResults,
int consolidateCitations,
boolean includeRawCitations)
Extract and parse both patent and non patent references within a patent
in PDF format.
|
java.lang.String |
processAllCitationsInXMLPatent(java.lang.String xmlPath,
java.util.List<BibDataSet> nplResults,
java.util.List<PatentItem> patentResults,
int consolidateCitations,
boolean includeRawCitations)
Extract and parse both patent and non patent references within a patent in ST.36 format.
|
java.util.List<Person> |
processAuthorsCitation(java.lang.String authorSequence)
Parse a sequence of authors from a citation.
|
java.util.List<java.util.List<Person>> |
processAuthorsCitationLists(java.util.List<java.lang.String> authorSequences)
Parse a list of independent sequences of authors from citations.
|
java.util.List<Person> |
processAuthorsHeader(java.lang.String authorSequence)
Parse a sequence of authors from a header.
|
java.util.List<Date> |
processDate(java.lang.String dateBlock)
Parse a raw string containing dates.
|
java.lang.String |
processHeader(java.lang.String inputFile,
GrobidAnalysisConfig config,
BiblioItem result) |
java.lang.String |
processHeader(java.lang.String inputFile,
int consolidate,
BiblioItem result)
Apply a parsing model for the header of a PDF file based on CRF, using
dynamic range of pages as header
|
java.lang.String |
processHeader(java.lang.String inputFile,
int consolidate,
boolean includeRawAffiliations,
BiblioItem result)
Apply a parsing model for the header of a PDF file based on CRF, using
first three pages of the PDF
|
BiblioItem |
processRawReference(java.lang.String reference,
int consolidate)
Apply a parsing model for a given single raw reference string based on CRF
|
java.util.List<BiblioItem> |
processRawReferences(java.util.List<java.lang.String> references,
int consolidate)
Apply a parsing model for a set of raw reference text based on CRF
|
java.util.List<BibDataSet> |
processReferences(java.io.File inputFile,
int consolidate)
Apply a parsing model to the reference block of a PDF file based on CRF
|
static java.lang.String |
reference2BibTeX(java.lang.String path,
java.util.List<BibDataSet> resBib,
int i)
Get the BibTeX string corresponding to the recognized citation section
for a given citation
|
static java.lang.String |
reference2TEI(java.lang.String path,
java.util.List<BibDataSet> resBib,
int i)
Get the TEI XML string corresponding to the recognized citation section
for a particular citation
|
java.lang.String |
references2BibTeX(java.lang.String path,
java.util.List<BibDataSet> resBib)
Get the BibTeX string corresponding to the recognized citation section
|
static java.lang.String |
references2TEI(java.lang.String path,
java.util.List<BibDataSet> resBib)
Get the TEI XML string corresponding to the recognized citation section,
with pointers and advanced structuring
|
static java.lang.String |
references2TEI2(java.lang.String path,
java.util.List<BibDataSet> resBib)
Get the TEI XML string corresponding to the recognized citation section
|
Language |
runLanguageId(java.lang.String filePath)
Basic run for language identification, default is on the body of the
current document.
|
Language |
runLanguageId(java.lang.String filePath,
java.lang.String ext)
Perform a language identification
|
java.lang.String |
segmentAndProcessHeader(java.io.File inputFile,
int consolidate,
BiblioItem result)
Use the segmentation model to identify the header section of a PDF file, then apply a parsing model for the
header based on CRF
|
static void |
setCntManager(CntManager cntManager) |
public Engine(boolean loadModels)
public java.util.List<Person> processAuthorsHeader(java.lang.String authorSequence) throws java.lang.Exception
authorSequence - the string corresponding to a raw sequence of names
Throws: java.lang.Exception
public java.util.List<Person> processAuthorsCitation(java.lang.String authorSequence) throws java.lang.Exception
authorSequence - the string corresponding to a raw sequence of names
Throws: java.lang.Exception
public java.util.List<java.util.List<Person>> processAuthorsCitationLists(java.util.List<java.lang.String> authorSequences) throws java.lang.Exception
authorSequences - the list of strings, each corresponding to a raw sequence of
names.
Throws: java.lang.Exception
public java.util.List<Affiliation> processAffiliation(java.lang.String addressBlock) throws java.io.IOException
addressBlock - the string corresponding to a raw affiliation+address
Throws: java.io.IOException
public java.util.List<java.util.List<Affiliation>> processAffiliations(java.util.List<java.lang.String> addressBlocks) throws java.lang.Exception
addressBlocks - the list of strings, each corresponding to a raw
affiliation+address.
Throws: java.lang.Exception
public java.util.List<Date> processDate(java.lang.String dateBlock) throws java.io.IOException
dateBlock - the string containing raw dates.
Throws: java.io.IOException
public BiblioItem processRawReference(java.lang.String reference, int consolidate)
reference - the reference string to be processed
consolidate - the consolidation option allows GROBID to exploit Crossref web services for improving header
information. 0 (no consolidation, default value), 1 (consolidate the citation and inject extra
metadata) or 2 (consolidate the citation and inject DOI only)
public java.util.List<BiblioItem> processRawReferences(java.util.List<java.lang.String> references, int consolidate) throws java.lang.Exception
references - the list of raw reference strings to be processed
consolidate - the consolidation option allows GROBID to exploit Crossref web services for improving header
information. 0 (no consolidation, default value), 1 (consolidate the citation and inject extra
metadata) or 2 (consolidate the citation and inject DOI only)
Throws: java.lang.Exception
public java.util.List<BibDataSet> processReferences(java.io.File inputFile, int consolidate)
inputFile - the path of the PDF file to be processed
consolidate - the consolidation option allows GROBID to exploit Crossref web services for improving header
information. 0 (no consolidation, default value), 1 (consolidate the citation and inject extra
metadata) or 2 (consolidate the citation and inject DOI only)
public java.lang.String downloadPDF(java.lang.String url,
java.lang.String dirName,
java.lang.String name)
url - URL of the PDF to download
dirName - directory where to store the downloaded PDF
name - file name
public java.util.List<java.lang.String> getAcceptedLanguages()
public void addAcceptedLanguages(java.lang.String lang)
lang - the language to be added, as an ISO 639 language code
public Language runLanguageId(java.lang.String filePath, java.lang.String ext)
ext - part
public Language runLanguageId(java.lang.String filePath)
public java.lang.String processHeader(java.lang.String inputFile,
int consolidate,
boolean includeRawAffiliations,
BiblioItem result)
inputFile - the path of the PDF file to be processed
consolidate - the consolidation option allows GROBID to exploit Crossref web services for improving header
information. 0 (no consolidation, default value), 1 (consolidate the citation and inject extra
metadata) or 2 (consolidate the citation and inject DOI only)
result - bib result
public java.lang.String processHeader(java.lang.String inputFile,
int consolidate,
BiblioItem result)
inputFile - the path of the PDF file to be processed
result - bib result
public java.lang.String processHeader(java.lang.String inputFile,
GrobidAnalysisConfig config,
BiblioItem result)
public java.lang.String segmentAndProcessHeader(java.io.File inputFile,
int consolidate,
BiblioItem result)
inputFile - the path of the PDF file to be processed
consolidate - the consolidation option allows GROBID to exploit Crossref web services for improving header
information. 0 (no consolidation, default value), 1 (consolidate the citation and inject extra
metadata) or 2 (consolidate the citation and inject DOI only)
result - bib result
public void createTrainingMonograph(java.io.File inputFile,
java.lang.String pathRaw,
java.lang.String pathTEI,
int id)
inputFile - the path of the PDF file to be processed
pathRaw - the path where to put the CRF feature file
pathTEI - the path where to put the annotated TEI representation (the
file to be corrected for gold-level training data)
id - an optional ID to be used in the TEI file and the full text
file, -1 if not used
public void createTrainingBlank(java.io.File inputFile,
java.lang.String pathRaw,
java.lang.String pathTEI,
int id)
inputFile - the path of the PDF file to be processed
pathRaw - the path where to put the CRF feature file
pathTEI - the path where to put the annotated TEI representation (the
file to be annotated for "from scratch" training data)
id - an optional ID to be used in the TEI file and the full text
file, -1 if not used
public void createTraining(java.io.File inputFile,
java.lang.String pathRaw,
java.lang.String pathTEI,
int id)
inputFile - the path of the PDF file to be processed
pathRaw - the path where to put the CRF feature file
pathTEI - the path where to put the annotated TEI representation (the
file to be corrected for gold-level training data)
id - an optional ID to be used in the TEI file, -1 if not used
public java.lang.String fullTextToTEI(java.io.File inputFile,
GrobidAnalysisConfig config)
throws java.lang.Exception
inputFile - absolute path to the pdf to be processed
config - Grobid config
Throws: java.lang.Exception
public Document fullTextToTEIDoc(java.io.File inputFile, GrobidAnalysisConfig config) throws java.lang.Exception
Throws: java.lang.Exception
public Document fullTextToTEIDoc(DocumentSource documentSource, GrobidAnalysisConfig config) throws java.lang.Exception
Throws: java.lang.Exception
public int batchCreateTraining(java.lang.String directoryPath,
java.lang.String resultPath,
int ind)
directoryPath - the path to the directory containing PDF to be processed.
resultPath - the path to the directory where the results as XML files
shall be written.
ind - identifier integer to be included in the resulting files to
identify the training case. This is optional: no identifier
will be included if ind = -1
public int batchCreateTrainingMonograph(java.lang.String directoryPath,
java.lang.String resultPath,
int ind)
directoryPath - the path to the directory containing PDF to be processed.
resultPath - the path to the directory where the results as XML files
and CRF feature files shall be written.
ind - identifier integer to be included in the resulting files to
identify the training case. This is optional: no identifier
will be included if ind = -1
public int batchCreateTrainingBlank(java.lang.String directoryPath,
java.lang.String resultPath,
int ind)
directoryPath - the path to the directory containing PDF to be processed.
resultPath - the path to the directory where the results as XML files
and default CRF feature files shall be written.
ind - identifier integer to be included in the resulting files to
identify the training case. This is optional: no identifier
will be included if ind = -1
public static java.lang.String header2TEI(BiblioItem resHeader)
public static java.lang.String header2BibTeX(BiblioItem resHeader)
public static java.lang.String references2TEI2(java.lang.String path,
java.util.List<BibDataSet> resBib)
public static java.lang.String references2TEI(java.lang.String path,
java.util.List<BibDataSet> resBib)
public java.lang.String references2BibTeX(java.lang.String path,
java.util.List<BibDataSet> resBib)
public static java.lang.String reference2TEI(java.lang.String path,
java.util.List<BibDataSet> resBib,
int i)
public static java.lang.String reference2BibTeX(java.lang.String path,
java.util.List<BibDataSet> resBib,
int i)
public java.lang.String processAllCitationsInPatent(java.lang.String text,
java.util.List<BibDataSet> nplResults,
java.util.List<PatentItem> patentResults,
int consolidateCitations,
boolean includeRawCitations)
throws java.lang.Exception
text - the string corresponding to the text body of the patent.
nplResults - the list of extracted and parsed non patent references as BiblioItem object. This
list must be instantiated before calling the method for receiving the results.
patentResults - the list of extracted and parsed patent references as PatentItem object. This list
must be instantiated before calling the method for receiving the results.
consolidateCitations - the consolidation option allows GROBID to exploit Crossref web services for improving
header information. 0 (no consolidation, default value), 1 (consolidate the citation
and inject extra metadata) or 2 (consolidate the citation and inject DOI only)
Throws: java.lang.Exception
public java.lang.String processAllCitationsInXMLPatent(java.lang.String xmlPath,
java.util.List<BibDataSet> nplResults,
java.util.List<PatentItem> patentResults,
int consolidateCitations,
boolean includeRawCitations)
throws java.lang.Exception
nplResults - the list of extracted and parsed non patent references as BiblioItem object. This
list must be instantiated before calling the method for receiving the results.
patentResults - the list of extracted and parsed patent references as PatentItem object. This list
must be instantiated before calling the method for receiving the results.
consolidateCitations - the consolidation option allows GROBID to exploit Crossref web services for improving
header information. 0 (no consolidation, default value), 1 (consolidate the citation
and inject extra metadata) or 2 (consolidate the citation and inject DOI only)
Throws: java.lang.Exception - if something went wrong
public java.lang.String processAllCitationsInPDFPatent(java.lang.String pdfPath,
java.util.List<BibDataSet> nplResults,
java.util.List<PatentItem> patentResults,
int consolidateCitations,
boolean includeRawCitations)
throws java.lang.Exception
pdfPath - pdf path
nplResults - the list of extracted and parsed non patent references as
BiblioItem object. This list must be instantiated before
calling the method for receiving the results.
patentResults - the list of extracted and parsed patent references as
PatentItem object. This list must be instantiated before
calling the method for receiving the results.
consolidateCitations - the consolidation option allows GROBID to exploit Crossref web services for improving
header information. 0 (no consolidation, default value), 1 (consolidate the citation
and inject extra metadata) or 2 (consolidate the citation and inject DOI only)
Throws: java.lang.Exception - if something went wrong
public java.lang.String annotateAllCitationsInPDFPatent(java.lang.String pdfPath,
int consolidateCitations,
boolean includeRawCitations)
throws java.lang.Exception
pdfPath - pdf path
consolidateCitations - the consolidation option allows GROBID to exploit Crossref web services for improving
header information. 0 (no consolidation, default value), 1 (consolidate the citation
and inject extra metadata) or 2 (consolidate the citation and inject DOI only)
Throws: java.lang.Exception
public void createTrainingPatentCitations(java.lang.String pathXML,
java.lang.String resultPath)
throws java.lang.Exception
pathXML - the path to the XML patent document to be processed.
resultPath - the path to the directory where the results as XML files
shall be written.
Throws: java.lang.Exception
public int batchCreateTrainingPatentcitations(java.lang.String directoryPath,
java.lang.String resultPath)
throws java.lang.Exception
directoryPath - the path to the directory containing XML files to be
processed.
resultPath - the path to the directory where the results as XML files
shall be written.
Throws: java.lang.Exception
public java.util.List<ChemicalEntity> extractChemicalEntities(java.lang.String text) throws java.lang.Exception
text - text to be processed.
Throws: java.lang.Exception
public java.lang.String getAbstract(Document doc) throws java.lang.Exception
Throws: java.lang.Exception
public java.lang.String printRefTitles(java.util.List<BibDataSet> resBib) throws java.lang.Exception
Throws: java.lang.Exception
public void close()
throws java.io.IOException
Specified by: close in interface java.io.Closeable
Specified by: close in interface java.lang.AutoCloseable
Throws: java.io.IOException
public static void setCntManager(CntManager cntManager)
public static CntManager getCntManager()
public EngineParsers getParsers()
public static Engine getEngine(boolean preload)