public class BasicStructureBuilder
extends java.lang.Object
Modifier and Type | Field and Description |
---|---|
static java.util.regex.Pattern | abstract_ |
static java.util.regex.Pattern | headerNumbering1 |
static java.util.regex.Pattern | headerNumbering2 |
static java.util.regex.Pattern | headerNumbering3 |
static java.util.regex.Pattern | headerNumbering4 |
static java.util.regex.Pattern | introduction |
static java.util.regex.Pattern | introductionStrict |
Constructor and Description |
---|
BasicStructureBuilder() |
Modifier and Type | Method and Description |
---|---|
boolean | filterLineNumber(Document doc) Filter out line numbering possibly present in the document. |
static Document | generalResultSegmentation(Document doc, java.lang.String labeledResult, java.util.List<LayoutToken> documentTokens) |
static Document | resultSegmentation(Document doc, java.lang.String labeledResult, java.util.List<java.lang.String> tokenizations) Set the main segments of the document based on the full text parsing results |
public static java.util.regex.Pattern introduction
public static java.util.regex.Pattern introductionStrict
public static java.util.regex.Pattern abstract_
public static java.util.regex.Pattern headerNumbering1
public static java.util.regex.Pattern headerNumbering2
public static java.util.regex.Pattern headerNumbering3
public static java.util.regex.Pattern headerNumbering4
public boolean filterLineNumber(Document doc)
doc - a document
public static Document generalResultSegmentation(Document doc, java.lang.String labeledResult, java.util.List<LayoutToken> documentTokens)
public static Document resultSegmentation(Document doc, java.lang.String labeledResult, java.util.List<java.lang.String> tokenizations)
doc - a document
labeledResult - string
tokenizations - tokens