public class LayoutTokensUtil
extends java.lang.Object
Modifier and Type | Field and Description |
---|---|
static <any> |
TO_TEXT_FUNCTION |
Constructor and Description |
---|
LayoutTokensUtil() |
Modifier and Type | Method and Description |
---|---|
static boolean |
containsToken(java.util.List<LayoutToken> toks,
java.lang.String text) |
static java.util.List<LayoutToken> |
dehyphenize(java.util.List<LayoutToken> tokens) |
protected static boolean |
doesRequireDehypenisation(java.util.List<LayoutToken> tokens,
int i)
Check whether the current token (at position i), or the hyphen, needs to be removed.
|
static java.util.List<LayoutToken> |
enrichWithNewLineInfo(java.util.List<LayoutToken> toks) |
static java.lang.String |
getCoordsString(java.util.List<LayoutToken> toks) |
static java.lang.String |
getCoordsStringForOneBox(java.util.List<LayoutToken> toks) |
static java.util.List<LayoutToken> |
getLayoutTokensForTokenizedText(java.util.List<java.lang.String> tokens) |
static boolean |
newLineToken(java.lang.String tok) |
static boolean |
noCoords(LayoutToken t) |
static java.lang.String |
normalizeDehyphenizeText(java.util.List<LayoutToken> tokens) |
static java.lang.String |
normalizeText(java.util.List<LayoutToken> tokens) |
static java.lang.String |
normalizeText(java.lang.String text) |
static boolean |
spaceyToken(java.lang.String tok) |
static java.util.List<java.util.List<LayoutToken>> |
split(java.util.List<LayoutToken> toks,
java.util.regex.Pattern p,
boolean preserveSeparator) |
static java.util.List<java.util.List<LayoutToken>> |
split(java.util.List<LayoutToken> toks,
java.util.regex.Pattern p,
boolean preserveSeparator,
boolean preserveLeftOvers) |
static java.util.List<LayoutToken> |
subListByOffset(java.util.List<LayoutToken> token,
int startIncluded) |
static java.util.List<LayoutToken> |
subListByOffset(java.util.List<LayoutToken> token,
int startIncluded,
int endExcluded) |
static int |
tokenPos(java.util.List<LayoutToken> toks,
java.util.regex.Pattern p) |
static int |
tokenPos(java.util.List<LayoutToken> toks,
java.lang.String text) |
static boolean |
tooFarAwayVertically(java.util.List<BoundingBox> boxes,
double distance) |
static java.lang.String |
toText(java.util.List<LayoutToken> tokens) |
public static java.util.List<LayoutToken> enrichWithNewLineInfo(java.util.List<LayoutToken> toks)
public static java.lang.String normalizeText(java.lang.String text)
public static java.lang.String normalizeText(java.util.List<LayoutToken> tokens)
public static java.lang.String normalizeDehyphenizeText(java.util.List<LayoutToken> tokens)
public static java.lang.String toText(java.util.List<LayoutToken> tokens)
public static boolean noCoords(LayoutToken t)
public static boolean spaceyToken(java.lang.String tok)
public static boolean newLineToken(java.lang.String tok)
public static boolean containsToken(java.util.List<LayoutToken> toks, java.lang.String text)
public static int tokenPos(java.util.List<LayoutToken> toks, java.lang.String text)
public static int tokenPos(java.util.List<LayoutToken> toks, java.util.regex.Pattern p)
public static java.util.List<java.util.List<LayoutToken>> split(java.util.List<LayoutToken> toks, java.util.regex.Pattern p, boolean preserveSeparator)
public static java.util.List<java.util.List<LayoutToken>> split(java.util.List<LayoutToken> toks, java.util.regex.Pattern p, boolean preserveSeparator, boolean preserveLeftOvers)
public static boolean tooFarAwayVertically(java.util.List<BoundingBox> boxes, double distance)
public static java.lang.String getCoordsString(java.util.List<LayoutToken> toks)
public static java.lang.String getCoordsStringForOneBox(java.util.List<LayoutToken> toks)
public static java.util.List<LayoutToken> dehyphenize(java.util.List<LayoutToken> tokens)
protected static boolean doesRequireDehypenisation(java.util.List<LayoutToken> tokens, int i)
It will check the tokens before and after. It will move to the next "non-space" token and verify that it is a plain word. If it is not, the hyphen is kept.
TODO: What should be done when punctuation is found?
public static java.util.List<LayoutToken> subListByOffset(java.util.List<LayoutToken> token, int startIncluded)
public static java.util.List<LayoutToken> subListByOffset(java.util.List<LayoutToken> token, int startIncluded, int endExcluded)
public static java.util.List<LayoutToken> getLayoutTokensForTokenizedText(java.util.List<java.lang.String> tokens)