public class Engine
extends java.lang.Object
implements java.io.Closeable
Constructor and Description |
---|
Engine(boolean loadModels)
Constructor for the Grobid engine instance.
|
Modifier and Type | Method and Description |
---|---|
void |
addAcceptedLanguages(java.lang.String lang)
Add a language to the list of accepted languages.
|
java.lang.String |
annotateAllCitationsInPDFPatent(java.lang.String pdfPath,
int consolidateCitations,
boolean includeRawCitations)
Extract and parse both patent and non patent references within a patent
in PDF format.
|
int |
batchCreateTraining(java.lang.String directoryPath,
java.lang.String resultPath,
int ind)
Process all the PDF in a given directory with a segmentation process and
produce the corresponding training data format files for manual
correction.
|
int |
batchCreateTrainingBlank(java.lang.String directoryPath,
java.lang.String resultPath,
int ind)
Process all the PDF in a given directory with a pdf extraction and
produce blank training data, i.e. files to be annotated for "from scratch" training data.
|
int |
batchCreateTrainingMonograph(java.lang.String directoryPath,
java.lang.String resultPath,
int ind)
Process all the PDF in a given directory with a monograph process and
produce the corresponding training data format files for manual
correction.
|
int |
batchCreateTrainingPatentcitations(java.lang.String directoryPath,
java.lang.String resultPath)
Process all the XML patent documents in a given directory with a patent
citation extraction and produce the corresponding training data format
files for manual correction.
|
void |
close() |
void |
createTraining(java.io.File inputFile,
java.lang.String pathRaw,
java.lang.String pathTEI,
int id)
Create training data for all models based on the application of
the current full text model on a new PDF
|
void |
createTrainingBlank(java.io.File inputFile,
java.lang.String pathRaw,
java.lang.String pathTEI,
int id)
Generate blank training data from provided directory of PDF documents, i.e. files to be annotated for "from scratch" training data.
|
void |
createTrainingMonograph(java.io.File inputFile,
java.lang.String pathRaw,
java.lang.String pathTEI,
int id)
Create training data for the monograph model based on the application of
the current monograph text model on a new PDF
|
void |
createTrainingPatentCitations(java.lang.String pathXML,
java.lang.String resultPath)
Process an XML patent document with a patent citation extraction and
produce the corresponding training data format files for manual
correction.
|
java.lang.String |
downloadPDF(java.lang.String url,
java.lang.String dirName,
java.lang.String name)
Download a PDF file.
|
java.util.List<ChemicalEntity> |
extractChemicalEntities(java.lang.String text)
Extract chemical names from text.
|
java.lang.String |
fullTextToTEI(java.io.File inputFile,
GrobidAnalysisConfig config)
//TODO: remove invalid JavaDoc once refactoring is done and tested (left for easier reference)
Parse and convert the current article into TEI, this method performs the
whole parsing and conversion process.
|
Document |
fullTextToTEIDoc(DocumentSource documentSource,
GrobidAnalysisConfig config) |
Document |
fullTextToTEIDoc(java.io.File inputFile,
GrobidAnalysisConfig config) |
java.lang.String |
getAbstract(Document doc)
Print the abstract content.
|
java.util.List<java.lang.String> |
getAcceptedLanguages()
Give the list of languages for which an extraction is allowed.
|
static CntManager |
getCntManager() |
static Engine |
getEngine(boolean preload) |
EngineParsers |
getParsers() |
static java.lang.String |
header2BibTeX(BiblioItem resHeader)
Get the BibTeX string corresponding to the recognized header text
|
static java.lang.String |
header2TEI(BiblioItem resHeader)
Get the TEI XML string corresponding to the recognized header text
|
java.lang.String |
printRefTitles(java.util.List<BibDataSet> resBib)
Return all the reference titles.
|
java.util.List<Affiliation> |
processAffiliation(java.lang.String addressBlock)
Parse a text block corresponding to an affiliation+address.
|
java.util.List<java.util.List<Affiliation>> |
processAffiliations(java.util.List<java.lang.String> addressBlocks)
Parse a list of text blocks corresponding to an affiliation+address.
|
java.lang.String |
processAllCitationsInPatent(java.lang.String text,
java.util.List<BibDataSet> nplResults,
java.util.List<PatentItem> patentResults,
int consolidateCitations,
boolean includeRawCitations)
Extract and parse both patent and non patent references within a patent text.
|
java.lang.String |
processAllCitationsInPDFPatent(java.lang.String pdfPath,
java.util.List<BibDataSet> nplResults,
java.util.List<PatentItem> patentResults,
int consolidateCitations,
boolean includeRawCitations)
Extract and parse both patent and non patent references within a patent
in PDF format.
|
java.lang.String |
processAllCitationsInXMLPatent(java.lang.String xmlPath,
java.util.List<BibDataSet> nplResults,
java.util.List<PatentItem> patentResults,
int consolidateCitations,
boolean includeRawCitations)
Extract and parse both patent and non patent references within a patent in ST.36 format.
|
java.util.List<Person> |
processAuthorsCitation(java.lang.String authorSequence)
Parse a sequence of authors from a citation.
|
java.util.List<java.util.List<Person>> |
processAuthorsCitationLists(java.util.List<java.lang.String> authorSequences)
Parse a list of independent sequences of authors from citations.
|
java.util.List<Person> |
processAuthorsHeader(java.lang.String authorSequence)
Parse a sequence of authors from a header.
|
java.util.List<Date> |
processDate(java.lang.String dateBlock)
Parse a raw string containing dates.
|
java.lang.String |
processHeader(java.lang.String inputFile,
GrobidAnalysisConfig config,
BiblioItem result) |
java.lang.String |
processHeader(java.lang.String inputFile,
int consolidate,
BiblioItem result)
Apply a parsing model for the header of a PDF file based on CRF, using
dynamic range of pages as header
|
java.lang.String |
processHeader(java.lang.String inputFile,
int consolidate,
boolean includeRawAffiliations,
BiblioItem result)
Apply a parsing model for the header of a PDF file based on CRF, using
first three pages of the PDF
|
BiblioItem |
processRawReference(java.lang.String reference,
int consolidate)
Apply a parsing model for a given single raw reference string based on CRF
|
java.util.List<BiblioItem> |
processRawReferences(java.util.List<java.lang.String> references,
int consolidate)
Apply a parsing model for a set of raw reference text based on CRF
|
java.util.List<BibDataSet> |
processReferences(java.io.File inputFile,
int consolidate)
Apply a parsing model to the reference block of a PDF file based on CRF
|
static java.lang.String |
reference2BibTeX(java.lang.String path,
java.util.List<BibDataSet> resBib,
int i)
Get the BibTeX string corresponding to the recognized citation section
for a given citation
|
static java.lang.String |
reference2TEI(java.lang.String path,
java.util.List<BibDataSet> resBib,
int i)
Get the TEI XML string corresponding to the recognized citation section
for a particular citation
|
java.lang.String |
references2BibTeX(java.lang.String path,
java.util.List<BibDataSet> resBib)
Get the BibTeX string corresponding to the recognized citation section
|
static java.lang.String |
references2TEI(java.lang.String path,
java.util.List<BibDataSet> resBib)
Get the TEI XML string corresponding to the recognized citation section,
with pointers and advanced structuring
|
static java.lang.String |
references2TEI2(java.lang.String path,
java.util.List<BibDataSet> resBib)
Get the TEI XML string corresponding to the recognized citation section
|
Language |
runLanguageId(java.lang.String filePath)
Basic run for language identification, default is on the body of the
current document.
|
Language |
runLanguageId(java.lang.String filePath,
java.lang.String ext)
Perform a language identification
|
java.lang.String |
segmentAndProcessHeader(java.io.File inputFile,
int consolidate,
BiblioItem result)
Use the segmentation model to identify the header section of a PDF file, then apply a parsing model for the
header based on CRF
|
static void |
setCntManager(CntManager cntManager) |
public Engine(boolean loadModels)
public java.util.List<Person> processAuthorsHeader(java.lang.String authorSequence) throws java.lang.Exception
authorSequence
- the string corresponding to a raw sequence of names
java.lang.Exception
public java.util.List<Person> processAuthorsCitation(java.lang.String authorSequence) throws java.lang.Exception
authorSequence
- the string corresponding to a raw sequence of names
java.lang.Exception
public java.util.List<java.util.List<Person>> processAuthorsCitationLists(java.util.List<java.lang.String> authorSequences) throws java.lang.Exception
authorSequences
- the list of strings corresponding each to a raw sequence of
names.
java.lang.Exception
public java.util.List<Affiliation> processAffiliation(java.lang.String addressBlock) throws java.io.IOException
addressBlock
- the string corresponding to a raw affiliation+address
java.io.IOException
public java.util.List<java.util.List<Affiliation>> processAffiliations(java.util.List<java.lang.String> addressBlocks) throws java.lang.Exception
addressBlocks
- the list of strings corresponding each to a raw
affiliation+address.
java.lang.Exception
public java.util.List<Date> processDate(java.lang.String dateBlock) throws java.io.IOException
dateBlock
- the string containing raw dates.
java.io.IOException
public BiblioItem processRawReference(java.lang.String reference, int consolidate)
reference
- the reference string to be processed
consolidate
- the consolidation option allows GROBID to exploit Crossref web services for improving header
information. 0 (no consolidation, default value), 1 (consolidate the citation and inject extra
metadata) or 2 (consolidate the citation and inject DOI only)
public java.util.List<BiblioItem> processRawReferences(java.util.List<java.lang.String> references, int consolidate) throws java.lang.Exception
references
- the list of raw reference strings to be processed
consolidate
- the consolidation option allows GROBID to exploit Crossref web services for improving header
information. 0 (no consolidation, default value), 1 (consolidate the citation and inject extra
metadata) or 2 (consolidate the citation and inject DOI only)
java.lang.Exception
public java.util.List<BibDataSet> processReferences(java.io.File inputFile, int consolidate)
inputFile
- the path of the PDF file to be processed
consolidate
- the consolidation option allows GROBID to exploit Crossref web services for improving header
information. 0 (no consolidation, default value), 1 (consolidate the citation and inject extra
metadata) or 2 (consolidate the citation and inject DOI only)
public java.lang.String downloadPDF(java.lang.String url, java.lang.String dirName, java.lang.String name)
url
- URL of the PDF to download
dirName
- directory where to store the downloaded PDF
name
- file name
public java.util.List<java.lang.String> getAcceptedLanguages()
public void addAcceptedLanguages(java.lang.String lang)
lang
- the language, as an ISO 639 language code, to be added
public Language runLanguageId(java.lang.String filePath, java.lang.String ext)
ext
- part
public Language runLanguageId(java.lang.String filePath)
public java.lang.String processHeader(java.lang.String inputFile, int consolidate, boolean includeRawAffiliations, BiblioItem result)
inputFile
- the path of the PDF file to be processed
consolidate
- the consolidation option allows GROBID to exploit Crossref web services for improving header
information. 0 (no consolidation, default value), 1 (consolidate the citation and inject extra
metadata) or 2 (consolidate the citation and inject DOI only)
result
- bib result
public java.lang.String processHeader(java.lang.String inputFile, int consolidate, BiblioItem result)
inputFile
- the path of the PDF file to be processed
result
- bib result
public java.lang.String processHeader(java.lang.String inputFile, GrobidAnalysisConfig config, BiblioItem result)
public java.lang.String segmentAndProcessHeader(java.io.File inputFile, int consolidate, BiblioItem result)
inputFile
- the path of the PDF file to be processed
consolidate
- the consolidation option allows GROBID to exploit Crossref web services for improving header
information. 0 (no consolidation, default value), 1 (consolidate the citation and inject extra
metadata) or 2 (consolidate the citation and inject DOI only)
result
- bib result
public void createTrainingMonograph(java.io.File inputFile, java.lang.String pathRaw, java.lang.String pathTEI, int id)
inputFile
- the path of the PDF file to be processed
pathRaw
- the path where to put the CRF feature file
pathTEI
- the path where to put the annotated TEI representation (the
file to be corrected for gold-level training data)
id
- an optional ID to be used in the TEI file and the full text
file, -1 if not used
public void createTrainingBlank(java.io.File inputFile, java.lang.String pathRaw, java.lang.String pathTEI, int id)
inputFile
- the path of the PDF file to be processed
pathRaw
- the path where to put the CRF feature file
pathTEI
- the path where to put the annotated TEI representation (the
file to be annotated for "from scratch" training data)
id
- an optional ID to be used in the TEI file and the full text
file, -1 if not used
public void createTraining(java.io.File inputFile, java.lang.String pathRaw, java.lang.String pathTEI, int id)
inputFile
- the path of the PDF file to be processed
pathRaw
- the path where to put the CRF feature file
pathTEI
- the path where to put the annotated TEI representation (the
file to be corrected for gold-level training data)
id
- an optional ID to be used in the TEI file, -1 if not used
public java.lang.String fullTextToTEI(java.io.File inputFile, GrobidAnalysisConfig config) throws java.lang.Exception
inputFile
- absolute path to the pdf to be processed
config
- Grobid config
java.lang.Exception
public Document fullTextToTEIDoc(java.io.File inputFile, GrobidAnalysisConfig config) throws java.lang.Exception
java.lang.Exception
public Document fullTextToTEIDoc(DocumentSource documentSource, GrobidAnalysisConfig config) throws java.lang.Exception
java.lang.Exception
public int batchCreateTraining(java.lang.String directoryPath, java.lang.String resultPath, int ind)
directoryPath
- the path to the directory containing PDF to be processed.
resultPath
- the path to the directory where the results as XML files
shall be written.
ind
- identifier integer to be included in the resulting files to
identify the training case. This is optional: no identifier
will be included if ind = -1
public int batchCreateTrainingMonograph(java.lang.String directoryPath, java.lang.String resultPath, int ind)
directoryPath
- the path to the directory containing PDF to be processed.
resultPath
- the path to the directory where the results as XML files
and CRF feature files shall be written.
ind
- identifier integer to be included in the resulting files to
identify the training case. This is optional: no identifier
will be included if ind = -1
public int batchCreateTrainingBlank(java.lang.String directoryPath, java.lang.String resultPath, int ind)
directoryPath
- the path to the directory containing PDF to be processed.
resultPath
- the path to the directory where the results as XML files
and default CRF feature files shall be written.
ind
- identifier integer to be included in the resulting files to
identify the training case. This is optional: no identifier
will be included if ind = -1
public static java.lang.String header2TEI(BiblioItem resHeader)
public static java.lang.String header2BibTeX(BiblioItem resHeader)
public static java.lang.String references2TEI2(java.lang.String path, java.util.List<BibDataSet> resBib)
public static java.lang.String references2TEI(java.lang.String path, java.util.List<BibDataSet> resBib)
public java.lang.String references2BibTeX(java.lang.String path, java.util.List<BibDataSet> resBib)
public static java.lang.String reference2TEI(java.lang.String path, java.util.List<BibDataSet> resBib, int i)
public static java.lang.String reference2BibTeX(java.lang.String path, java.util.List<BibDataSet> resBib, int i)
public java.lang.String processAllCitationsInPatent(java.lang.String text, java.util.List<BibDataSet> nplResults, java.util.List<PatentItem> patentResults, int consolidateCitations, boolean includeRawCitations) throws java.lang.Exception
text
- the string corresponding to the text body of the patent.
nplResults
- the list of extracted and parsed non patent references as BiblioItem object. This
list must be instantiated before calling the method for receiving the results.
patentResults
- the list of extracted and parsed patent references as PatentItem object. This list
must be instantiated before calling the method for receiving the results.
consolidateCitations
- the consolidation option allows GROBID to exploit Crossref web services for improving
header information. 0 (no consolidation, default value), 1 (consolidate the citation
and inject extra metadata) or 2 (consolidate the citation and inject DOI only)
java.lang.Exception
public java.lang.String processAllCitationsInXMLPatent(java.lang.String xmlPath, java.util.List<BibDataSet> nplResults, java.util.List<PatentItem> patentResults, int consolidateCitations, boolean includeRawCitations) throws java.lang.Exception
nplResults
- the list of extracted and parsed non patent references as BiblioItem object. This
list must be instantiated before calling the method for receiving the results.
patentResults
- the list of extracted and parsed patent references as PatentItem object. This list
must be instantiated before calling the method for receiving the results.
consolidateCitations
- the consolidation option allows GROBID to exploit Crossref web services for improving
header information. 0 (no consolidation, default value), 1 (consolidate the citation
and inject extra metadata) or 2 (consolidate the citation and inject DOI only)
java.lang.Exception
- if something went wrong
public java.lang.String processAllCitationsInPDFPatent(java.lang.String pdfPath, java.util.List<BibDataSet> nplResults, java.util.List<PatentItem> patentResults, int consolidateCitations, boolean includeRawCitations) throws java.lang.Exception
pdfPath
- pdf path
nplResults
- the list of extracted and parsed non patent references as
BiblioItem object. This list must be instantiated before
calling the method for receiving the results.
patentResults
- the list of extracted and parsed patent references as
PatentItem object. This list must be instantiated before
calling the method for receiving the results.
consolidateCitations
- the consolidation option allows GROBID to exploit Crossref web services for improving
header information. 0 (no consolidation, default value), 1 (consolidate the citation
and inject extra metadata) or 2 (consolidate the citation and inject DOI only)
java.lang.Exception
- if something went wrong
public java.lang.String annotateAllCitationsInPDFPatent(java.lang.String pdfPath, int consolidateCitations, boolean includeRawCitations) throws java.lang.Exception
pdfPath
- pdf path
consolidateCitations
- the consolidation option allows GROBID to exploit Crossref web services for improving
header information. 0 (no consolidation, default value), 1 (consolidate the citation
and inject extra metadata) or 2 (consolidate the citation and inject DOI only)
java.lang.Exception
public void createTrainingPatentCitations(java.lang.String pathXML, java.lang.String resultPath) throws java.lang.Exception
pathXML
- the path to the XML patent document to be processed.
resultPath
- the path to the directory where the results as XML files
shall be written.
java.lang.Exception
public int batchCreateTrainingPatentcitations(java.lang.String directoryPath, java.lang.String resultPath) throws java.lang.Exception
directoryPath
- the path to the directory containing XML files to be
processed.
resultPath
- the path to the directory where the results as XML files
shall be written.
java.lang.Exception
public java.util.List<ChemicalEntity> extractChemicalEntities(java.lang.String text) throws java.lang.Exception
text
- text to be processed.
java.lang.Exception
public java.lang.String getAbstract(Document doc) throws java.lang.Exception
java.lang.Exception
public java.lang.String printRefTitles(java.util.List<BibDataSet> resBib) throws java.lang.Exception
java.lang.Exception
public void close() throws java.io.IOException
close
in interface java.io.Closeable
close
in interface java.lang.AutoCloseable
java.io.IOException
public static void setCntManager(CntManager cntManager)
public static CntManager getCntManager()
public EngineParsers getParsers()
public static Engine getEngine(boolean preload)