public class RegexTokenizer extends UnaryTransformer<String,scala.collection.Seq<String>,RegexTokenizer> implements DefaultParamsWritable
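A regex-based tokenizer that extracts tokens either by using the provided regex pattern to split the text on gaps (the default) or by repeatedly matching the regex (if gaps is false).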
Optional parameters also allow filtering tokens using a minimal length.
It returns an array of strings that can be empty.
Constructor and Description |
---|
RegexTokenizer() |
RegexTokenizer(String uid) |
Modifier and Type | Method and Description |
---|---|
static Params |
clear(Param<?> param) |
RegexTokenizer |
copy(ParamMap extra)
Creates a copy of this instance with the same UID and some extra params.
|
static String |
explainParam(Param<?> param) |
static String |
explainParams() |
static ParamMap |
extractParamMap() |
static ParamMap |
extractParamMap(ParamMap extra) |
BooleanParam |
gaps()
Indicates whether regex splits on gaps (true) or matches tokens (false).
|
static <T> scala.Option<T> |
get(Param<T> param) |
static <T> scala.Option<T> |
getDefault(Param<T> param) |
boolean |
getGaps() |
static String |
getInputCol() |
int |
getMinTokenLength() |
static <T> T |
getOrDefault(Param<T> param) |
static String |
getOutputCol() |
static Param<Object> |
getParam(String paramName) |
String |
getPattern() |
boolean |
getToLowercase() |
static <T> boolean |
hasDefault(Param<T> param) |
static boolean |
hasParam(String paramName) |
static Param<String> |
inputCol() |
static boolean |
isDefined(Param<?> param) |
static boolean |
isSet(Param<?> param) |
static RegexTokenizer |
load(String path) |
IntParam |
minTokenLength()
Minimum token length, greater than or equal to 0.
|
static Param<String> |
outputCol() |
static Param<?>[] |
params() |
Param<String> |
pattern()
Regex pattern used to match delimiters if gaps is true or tokens if gaps is false. |
static void |
save(String path) |
static <T> Params |
set(Param<T> param,
T value) |
RegexTokenizer |
setGaps(boolean value) |
static T |
setInputCol(String value) |
RegexTokenizer |
setMinTokenLength(int value) |
static T |
setOutputCol(String value) |
RegexTokenizer |
setPattern(String value) |
RegexTokenizer |
setToLowercase(boolean value) |
BooleanParam |
toLowercase()
Indicates whether to convert all characters to lowercase before tokenizing.
|
static String |
toString() |
static Dataset<Row> |
transform(Dataset<?> dataset) |
static Dataset<Row> |
transform(Dataset<?> dataset,
ParamMap paramMap) |
static Dataset<Row> |
transform(Dataset<?> dataset,
ParamPair<?> firstParamPair,
ParamPair<?>... otherParamPairs) |
static Dataset<Row> |
transform(Dataset<?> dataset,
ParamPair<?> firstParamPair,
scala.collection.Seq<ParamPair<?>> otherParamPairs) |
static StructType |
transformSchema(StructType schema) |
String |
uid()
An immutable unique ID for the object and its derivatives.
|
static MLWriter |
write() |
setInputCol, setOutputCol, transform, transformSchema
transform, transform, transform
equals, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
write
save
getInputCol, inputCol
getOutputCol, outputCol
clear, copyValues, defaultCopy, defaultParamMap, explainParam, explainParams, extractParamMap, extractParamMap, get, getDefault, getOrDefault, getParam, hasDefault, hasParam, isDefined, isSet, paramMap, params, set, set, set, setDefault, setDefault, shouldOwn
toString
initializeLogging, initializeLogIfNecessary, initializeLogIfNecessary, isTraceEnabled, log_, log, logDebug, logDebug, logError, logError, logInfo, logInfo, logName, logTrace, logTrace, logWarning, logWarning
public RegexTokenizer(String uid)
public RegexTokenizer()
public static RegexTokenizer load(String path)
public static String toString()
public static Param<?>[] params()
public static String explainParam(Param<?> param)
public static String explainParams()
public static final boolean isSet(Param<?> param)
public static final boolean isDefined(Param<?> param)
public static boolean hasParam(String paramName)
public static Param<Object> getParam(String paramName)
public static final <T> scala.Option<T> get(Param<T> param)
public static final <T> T getOrDefault(Param<T> param)
public static final <T> scala.Option<T> getDefault(Param<T> param)
public static final <T> boolean hasDefault(Param<T> param)
public static final ParamMap extractParamMap()
public static Dataset<Row> transform(Dataset<?> dataset, ParamPair<?> firstParamPair, scala.collection.Seq<ParamPair<?>> otherParamPairs)
public static Dataset<Row> transform(Dataset<?> dataset, ParamPair<?> firstParamPair, ParamPair<?>... otherParamPairs)
public static final Param<String> inputCol()
public static final String getInputCol()
public static final Param<String> outputCol()
public static final String getOutputCol()
public static T setInputCol(String value)
public static T setOutputCol(String value)
public static StructType transformSchema(StructType schema)
public static void save(String path) throws java.io.IOException
Throws: java.io.IOException
public static MLWriter write()
public String uid()
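Description copied from interface: Identifiable
An immutable unique ID for the object and its derivatives.
Specified by: uid in interface Identifiable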
public IntParam minTokenLength()
public RegexTokenizer setMinTokenLength(int value)
public int getMinTokenLength()
public BooleanParam gaps()
public RegexTokenizer setGaps(boolean value)
public boolean getGaps()
public Param<String> pattern()
Regex pattern used to match delimiters if gaps is true or tokens if gaps is false.
Default: "\\s+"
public RegexTokenizer setPattern(String value)
public String getPattern()
public final BooleanParam toLowercase()
public RegexTokenizer setToLowercase(boolean value)
public boolean getToLowercase()
public RegexTokenizer copy(ParamMap extra)
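Description copied from interface: Params
Creates a copy of this instance with the same UID and some extra params. See defaultCopy().
Specified by: copy in interface Params
Overrides: copy in class UnaryTransformer<String,scala.collection.Seq<String>,RegexTokenizer>
Parameters: extra - (undocumented)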