public class ReferenceExtractor
extends java.lang.Object
implements java.io.Closeable
Modifier and Type | Field and Description |
---|---|
java.lang.String |
currentPatentNumber |
boolean |
debug |
Lexicon |
lexicon |
OPSService |
ops |
java.util.ArrayList<BibDataSet> |
resBib |
Constructor and Description |
---|
ReferenceExtractor() |
ReferenceExtractor(EngineParsers parsers) |
Modifier and Type | Method and Description |
---|---|
java.lang.String |
annotateAllReferences(Document doc,
java.util.List<LayoutToken> tokenizations,
boolean filterDuplicate,
int consolidate,
boolean includeRawCitations,
java.util.List<PatentItem> patents,
java.util.List<BibDataSet> articles)
Annotate all references from a list of layout tokens.
|
java.lang.String |
annotateAllReferencesPDFFile(java.lang.String inputFile,
boolean filterDuplicate,
int consolidate,
boolean includeRawCitations,
java.util.List<PatentItem> patents,
java.util.List<BibDataSet> articles)
JSON annotations for all references from the PDF file of a patent publication.
|
void |
close() |
java.lang.String |
extractAllReferencesOPS(boolean filterDuplicate,
int consolidate,
boolean includeRawCitations,
java.util.List<PatentItem> patents,
java.util.List<BibDataSet> articles)
Extract all references from the full text retrieved via OPS.
|
java.lang.String |
extractAllReferencesPDFFile(java.lang.String inputFile,
boolean filterDuplicate,
int consolidate,
boolean includeRawCitations,
java.util.List<PatentItem> patents,
java.util.List<BibDataSet> articles)
Extract all references from the PDF file of a patent publication.
|
java.lang.String |
extractAllReferencesString(java.lang.String text,
boolean filterDuplicate,
int consolidate,
boolean includeRawCitations,
java.util.List<PatentItem> patents,
java.util.List<BibDataSet> articles)
Extract all references from a simple piece of text.
|
java.lang.String |
extractAllReferencesXMLFile(java.lang.String pathXML,
boolean filterDuplicate,
int consolidate,
boolean includeRawCitations,
java.util.List<PatentItem> patents,
java.util.List<BibDataSet> articles)
Extract all references from an XML file in ST.36 or MAREC format.
|
java.lang.String |
extractPatentReferencesXMLFile(java.lang.String pathXML,
boolean filterDuplicate,
int consolidate,
boolean includeRawCitations,
java.util.List<PatentItem> patents)
Extract all references from a patent in an ST.36-like XML format.
|
void |
generateTrainingData(java.lang.String documentPath,
java.lang.String newTrainingPath)
Annotate a new XML patent document based on training data format with the current model.
|
void |
generateXMLReport(java.io.File file,
java.util.ArrayList<PatentItem> patents,
java.util.ArrayList<BibDataSet> articles)
Write the list of extracted references to an XML file.
|
boolean |
getDocOPS(java.lang.String number)
Get a patent description by its number, via OPS.
|
java.lang.String |
reference2BibTeX(int i)
Get the BibTeX string corresponding to the recognized citation section
for a given citation
|
java.lang.String |
reference2TEI(int i)
Get the TEI XML string corresponding to the recognized citation section for
a particular citation
|
java.lang.String |
references2BibTeX()
Get the BibTeX string corresponding to the recognized citation section
|
java.lang.String |
references2TEI()
Get the TEI XML string corresponding to the recognized citation section,
with pointers and advanced structuring
|
java.lang.String |
references2TEI2()
Get the TEI XML string corresponding to the recognized citation section
|
void |
setDocumentPath(java.lang.String dirName) |
public boolean debug
public Lexicon lexicon
public java.lang.String currentPatentNumber
public OPSService ops
public java.util.ArrayList<BibDataSet> resBib
public ReferenceExtractor()
public ReferenceExtractor(EngineParsers parsers)
public void setDocumentPath(java.lang.String dirName)
public java.lang.String extractAllReferencesOPS(boolean filterDuplicate, int consolidate, boolean includeRawCitations, java.util.List<PatentItem> patents, java.util.List<BibDataSet> articles)
public java.lang.String extractPatentReferencesXMLFile(java.lang.String pathXML, boolean filterDuplicate, int consolidate, boolean includeRawCitations, java.util.List<PatentItem> patents)
public java.lang.String extractAllReferencesXMLFile(java.lang.String pathXML, boolean filterDuplicate, int consolidate, boolean includeRawCitations, java.util.List<PatentItem> patents, java.util.List<BibDataSet> articles)
public java.lang.String extractAllReferencesPDFFile(java.lang.String inputFile, boolean filterDuplicate, int consolidate, boolean includeRawCitations, java.util.List<PatentItem> patents, java.util.List<BibDataSet> articles)
public java.lang.String annotateAllReferencesPDFFile(java.lang.String inputFile, boolean filterDuplicate, int consolidate, boolean includeRawCitations, java.util.List<PatentItem> patents, java.util.List<BibDataSet> articles)
public java.lang.String extractAllReferencesString(java.lang.String text, boolean filterDuplicate, int consolidate, boolean includeRawCitations, java.util.List<PatentItem> patents, java.util.List<BibDataSet> articles)
public java.lang.String annotateAllReferences(Document doc, java.util.List<LayoutToken> tokenizations, boolean filterDuplicate, int consolidate, boolean includeRawCitations, java.util.List<PatentItem> patents, java.util.List<BibDataSet> articles)
public java.lang.String references2TEI2()
public java.lang.String reference2TEI(int i)
public java.lang.String references2BibTeX()
public java.lang.String references2TEI()
public java.lang.String reference2BibTeX(int i)
public void generateTrainingData(java.lang.String documentPath, java.lang.String newTrainingPath)
documentPath
- is the path to the file to be processed
newTrainingPath
- new training path
public boolean getDocOPS(java.lang.String number)
public void generateXMLReport(java.io.File file, java.util.ArrayList<PatentItem> patents, java.util.ArrayList<BibDataSet> articles)
public void close() throws java.io.IOException
Specified by: close in interface java.io.Closeable
Specified by: close in interface java.lang.AutoCloseable
Throws: java.io.IOException