public class TextUtilities
extends java.lang.Object
Modifier and Type | Field and Description |
---|---|
static java.lang.String |
AND |
static java.util.regex.Pattern |
arXivPattern |
static java.lang.String |
COLON |
static java.lang.String |
COMMA |
static java.lang.String |
delimiters |
static java.util.regex.Pattern |
DOIPattern |
static java.lang.String |
DOUBLE_QUOTE |
static java.lang.String |
END_BRACKET |
static java.lang.String |
ESC_AND |
static java.lang.String |
ESC_DOUBLE_QUOTE |
static java.lang.String |
ESC_GREATER_THAN |
static java.lang.String |
ESC_LESS_THAN |
static java.lang.String |
fullPunctuations |
static java.lang.String |
GREATER_THAN |
static java.lang.String |
LESS_THAN |
static java.lang.String |
NEW_LINE |
static java.lang.String |
OR |
static java.util.regex.Pattern |
pmcidPattern |
static java.util.regex.Pattern |
pmidPattern |
static java.lang.String |
punctuations |
static java.lang.String |
QUOTE |
static java.lang.String |
restrictedPunctuations |
static java.lang.String |
SHARP |
static java.lang.String |
SLASH |
static java.lang.String |
SPACE |
static java.lang.String |
START_BRACKET |
static java.util.List<java.lang.String> |
stopwords |
static java.util.regex.Pattern |
urlPattern |
Constructor and Description |
---|
TextUtilities() |
Modifier and Type | Method and Description |
---|---|
static void |
appendN(java.lang.StringBuffer buffer,
char c,
int nb)
Append the char c nb times to a StringBuffer...
|
static java.lang.String |
capitalizeFully(java.lang.String input,
java.lang.String delimiters)
This is a re-implementation of capitalizeFully from Apache Commons Lang, because the original does not appear to work
properly.
|
static java.lang.String |
clean(java.lang.String token)
Map special ligatures and characters coming from the PDF
|
static java.lang.String |
cleanField(java.lang.String input0,
boolean applyStopwordsFilter)
Remove useless punctuation at the end and beginning of a metadata field.
|
static java.lang.String |
convertStreamToString(java.io.InputStream is) |
static int |
countDigit(java.lang.String text)
Count the number of digits in a given string.
|
static java.util.List<LayoutToken> |
dehyphenize(java.util.List<LayoutToken> tokens)
Deprecated.
|
static java.lang.String |
dehyphenize(java.lang.String text) |
static java.lang.String |
dehyphenizeHard(java.lang.String text)
Deprecated.
|
protected static boolean |
doesRequireDehypenisation(java.util.List<LayoutToken> tokens,
int i)
Deprecated.
|
static boolean |
filterLine(java.lang.String line) |
static java.lang.String |
formatFourDecimals(double d) |
static java.lang.String |
formatTwoDecimals(double d) |
static java.util.List<java.lang.String> |
generateEmailVariants(java.lang.String firstName,
java.lang.String lastName) |
static java.lang.String |
getFirstToken(java.lang.String section) |
static java.lang.String |
getLastToken(java.lang.String section) |
static int |
getLevenshteinDistance(java.lang.String s,
java.lang.String t)
Levenshtein distance between two strings
|
static int |
getNbTokens(java.lang.String line,
int currentLinePos,
java.util.List<java.lang.String> tokenization)
Return the number of tokens in a line, given an existing global tokenization and the current
start position of the line in this global tokenization.
|
static int |
getOccCount(java.lang.String term,
java.lang.String string) |
static java.lang.String |
HTMLEncode(java.lang.String string)
Encode a string to be displayed in HTML.
If fullHTML is set, then all Unicode characters above 7 bits are converted into
HTML entities.
|
static java.lang.String |
HTMLEncode(java.lang.String string,
boolean fullHTML) |
static boolean |
isAllLowerCase(java.lang.String text) |
static boolean |
isAllUpperCase(java.lang.String text) |
static boolean |
isAllUpperCaseOrDigitOrDot(java.lang.String text)
Useful for recognising an acronym candidate: check if a text is only
composed of upper case, dot and digit characters
|
static java.lang.String |
JSONEncode(java.lang.String json) |
static java.lang.String |
normalizeRegex(java.lang.String string) |
static java.lang.String |
prefix(java.lang.String s,
int count)
Return the prefix of a string.
|
static java.lang.String |
punctuationProfile(java.lang.String line)
Give the punctuation profile of a line, i.e.
|
static java.lang.String |
removeAccents(java.lang.String input)
To replace accented characters in a unicode string by unaccented equivalents:
é -> e, ü -> ue, ß -> ss, etc.
|
static java.lang.StringBuilder |
replaceAll(java.lang.StringBuilder sb,
java.lang.String regex,
java.lang.String replacement)
The equivalent of String.replaceAll() for StringBuilder
|
static java.util.List<java.lang.String> |
segment(java.lang.String input,
java.lang.String segments)
Segment a piece of text following a list of segmentation characters.
|
static java.lang.String |
shadowNumbers(java.lang.String string)
Replace numbers in the string by a dummy character for string distance evaluations
|
static java.lang.String |
strrep(char c,
int times) |
static java.lang.String |
suffix(java.lang.String s,
int count)
Return the suffix of a string.
|
static boolean |
test_digit(java.lang.String tok)
Test whether the current string contains at least one digit.
|
static java.lang.String |
trimEncodedCharaters(java.lang.String string)
Ensure that special XML characters are correctly encoded.
|
static java.lang.String |
wordShape(java.lang.String word) |
static java.lang.String |
wordShapeTrimmed(java.lang.String word) |
public static final java.lang.String punctuations
public static final java.lang.String fullPunctuations
public static final java.lang.String restrictedPunctuations
public static java.lang.String delimiters
public static final java.lang.String OR
public static final java.lang.String NEW_LINE
public static final java.lang.String SPACE
public static final java.lang.String COMMA
public static final java.lang.String QUOTE
public static final java.lang.String END_BRACKET
public static final java.lang.String START_BRACKET
public static final java.lang.String SHARP
public static final java.lang.String COLON
public static final java.lang.String DOUBLE_QUOTE
public static final java.lang.String ESC_DOUBLE_QUOTE
public static final java.lang.String LESS_THAN
public static final java.lang.String ESC_LESS_THAN
public static final java.lang.String GREATER_THAN
public static final java.lang.String ESC_GREATER_THAN
public static final java.lang.String AND
public static final java.lang.String ESC_AND
public static final java.lang.String SLASH
public static final java.util.regex.Pattern DOIPattern
public static final java.util.regex.Pattern arXivPattern
public static final java.util.regex.Pattern pmidPattern
public static final java.util.regex.Pattern pmcidPattern
public static final java.util.regex.Pattern urlPattern
public static final java.util.List<java.lang.String> stopwords
public static java.lang.String shadowNumbers(java.lang.String string)
string
- the string to be processed.
@Deprecated public static java.util.List<LayoutToken> dehyphenize(java.util.List<LayoutToken> tokens)
@Deprecated protected static boolean doesRequireDehypenisation(java.util.List<LayoutToken> tokens, int i)
public static java.lang.String dehyphenize(java.lang.String text)
public static java.lang.String getLastToken(java.lang.String section)
public static java.lang.String getFirstToken(java.lang.String section)
@Deprecated public static java.lang.String dehyphenizeHard(java.lang.String text)
text
- the string to be processed, without preserved ends of lines.
Deprecated method, no longer needed since the @newline markers are preserved thanks to the LayoutTokens.
public static int getLevenshteinDistance(java.lang.String s, java.lang.String t)
s
- the first string to be compared.
t
- the second string to be compared.
public static final void appendN(java.lang.StringBuffer buffer, char c, int nb)
public static final java.lang.String removeAccents(java.lang.String input)
input
- the string to be processed.
public static final java.lang.String cleanField(java.lang.String input0, boolean applyStopwordsFilter)
public static final java.util.List<java.lang.String> segment(java.lang.String input, java.lang.String segments)
input
- the string to be processed.
segments
- the characters creating a segment (typically space and punctuation characters).
public static java.lang.String HTMLEncode(java.lang.String string)
public static java.lang.String HTMLEncode(java.lang.String string, boolean fullHTML)
public static java.lang.String normalizeRegex(java.lang.String string)
public static java.lang.String convertStreamToString(java.io.InputStream is)
public static int countDigit(java.lang.String text)
text
- the string to be processed.
public static java.lang.String clean(java.lang.String token)
public static java.lang.String formatTwoDecimals(double d)
public static java.lang.String formatFourDecimals(double d)
public static boolean isAllUpperCase(java.lang.String text)
public static boolean isAllLowerCase(java.lang.String text)
public static java.util.List<java.lang.String> generateEmailVariants(java.lang.String firstName, java.lang.String lastName)
public static java.lang.String capitalizeFully(java.lang.String input, java.lang.String delimiters)
public static java.lang.String wordShape(java.lang.String word)
public static java.lang.String wordShapeTrimmed(java.lang.String word)
public static java.lang.String punctuationProfile(java.lang.String line)
line
- the string corresponding to a line
Throws: java.lang.Exception
public static int getNbTokens(java.lang.String line, int currentLinePos, java.util.List<java.lang.String> tokenization) throws java.lang.Exception
line
- the string corresponding to a line
currentLinePos
- position of the line in the tokenization
tokenization
- the global tokenization where the line appears
Throws: java.lang.Exception
public static java.lang.String trimEncodedCharaters(java.lang.String string)
public static boolean filterLine(java.lang.String line)
public static java.lang.StringBuilder replaceAll(java.lang.StringBuilder sb, java.lang.String regex, java.lang.String replacement)
public static java.lang.String prefix(java.lang.String s, int count)
public static java.lang.String suffix(java.lang.String s, int count)
public static java.lang.String JSONEncode(java.lang.String json)
public static java.lang.String strrep(char c, int times)
public static int getOccCount(java.lang.String term, java.lang.String string)
public static boolean test_digit(java.lang.String tok)
tok
- the string to be processed.
public static boolean isAllUpperCaseOrDigitOrDot(java.lang.String text)