public class HadoopRDD<K,V> extends RDD<scala.Tuple2<K,V>> implements Logging
:: DeveloperApi ::
An RDD that provides core functionality for reading data stored in Hadoop (e.g., files in HDFS, sources in HBase, or S3), using the older MapReduce API (org.apache.hadoop.mapred).
Note: Instantiating this class directly is not recommended; please use org.apache.spark.SparkContext.hadoopRDD() instead.
param: sc The SparkContext to associate the RDD with.
param: broadcastedConf A general Hadoop Configuration, or a subclass of it. If the enclosed variable references an instance of JobConf, then that JobConf will be used for the Hadoop job. Otherwise, a new JobConf will be created on each slave using the enclosed Configuration.
param: initLocalJobConfFuncOpt Optional closure used to initialize any JobConf that HadoopRDD creates.
param: inputFormatClass Storage format of the data to be read.
param: keyClass Class of the key associated with the inputFormatClass.
param: valueClass Class of the value associated with the inputFormatClass.
param: minPartitions Minimum number of HadoopRDD partitions (Hadoop Splits) to generate.
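As a minimal sketch of the recommended entry point, the snippet below builds a JobConf for the old mapred API and obtains the RDD through SparkContext.hadoopRDD(); the input path and local master are placeholder assumptions, and the key, value, and input-format classes mirror the parameters documented above.

```scala
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapred.{FileInputFormat, JobConf, TextInputFormat}
import org.apache.spark.{SparkConf, SparkContext}

object HadoopRDDExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("hadoop-rdd-example").setMaster("local[*]"))

    // Build a JobConf for the old (mapred) API and point it at an input path (placeholder).
    val jobConf = new JobConf()
    FileInputFormat.setInputPaths(jobConf, "hdfs:///tmp/input")

    // Returns an RDD[(LongWritable, Text)] backed by a HadoopRDD.
    val lines = sc.hadoopRDD(
      jobConf,
      classOf[TextInputFormat],
      classOf[LongWritable],
      classOf[Text],
      2 /* minPartitions */)

    // The record reader reuses Writable objects, so convert to Strings before collecting.
    lines.map { case (_, text) => text.toString }.take(5).foreach(println)

    sc.stop()
  }
}
```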
Constructor and Description |
---|
HadoopRDD(SparkContext sc, Broadcast<org.apache.spark.util.SerializableConfiguration> broadcastedConf, scala.Option<scala.Function1<org.apache.hadoop.mapred.JobConf,scala.runtime.BoxedUnit>> initLocalJobConfFuncOpt, java.lang.Class<? extends org.apache.hadoop.mapred.InputFormat<K,V>> inputFormatClass, java.lang.Class<K> keyClass, java.lang.Class<V> valueClass, int minPartitions) |
HadoopRDD(SparkContext sc, org.apache.hadoop.mapred.JobConf conf, java.lang.Class<? extends org.apache.hadoop.mapred.InputFormat<K,V>> inputFormatClass, java.lang.Class<K> keyClass, java.lang.Class<V> valueClass, int minPartitions) |
Modifier and Type | Method and Description |
---|---|
static void | addLocalConfiguration(java.lang.String jobTrackerId, int jobId, int splitId, int attemptId, org.apache.hadoop.mapred.JobConf conf) Add Hadoop configuration specific to a single partition and attempt. |
void | checkpoint() Mark this RDD for checkpointing. |
InterruptibleIterator<scala.Tuple2<K,V>> | compute(Partition theSplit, TaskContext context) :: DeveloperApi :: Implemented by subclasses to compute a given partition. |
static java.lang.Object | CONFIGURATION_INSTANTIATION_LOCK() Configuration's constructor is not threadsafe (see SPARK-1097 and HADOOP-10456). |
static boolean | containsCachedMetadata(java.lang.String key) |
static java.lang.Object | getCachedMetadata(java.lang.String key) The three methods below are helpers for accessing the local map, a property of the SparkEnv of the local process. |
org.apache.hadoop.conf.Configuration | getConf() |
protected org.apache.hadoop.mapred.InputFormat<K,V> | getInputFormat(org.apache.hadoop.mapred.JobConf conf) |
protected org.apache.hadoop.mapred.JobConf | getJobConf() |
Partition[] | getPartitions() Implemented by subclasses to return the set of partitions in this RDD. |
scala.collection.Seq<java.lang.String> | getPreferredLocations(Partition split) Optionally overridden by subclasses to specify placement preferences. |
protected java.lang.String | inputFormatCacheKey() |
protected java.lang.String | jobConfCacheKey() |
<U> RDD<U> | mapPartitionsWithInputSplit(scala.Function2<org.apache.hadoop.mapred.InputSplit,scala.collection.Iterator<scala.Tuple2<K,V>>,scala.collection.Iterator<U>> f, boolean preservesPartitioning, scala.reflect.ClassTag<U> evidence$1) Maps over a partition, providing the InputSplit that was used as the base of the partition. |
HadoopRDD<K,V> | persist(StorageLevel storageLevel) Set this RDD's storage level to persist its values across operations after the first time it is computed. |
static int | RECORDS_BETWEEN_BYTES_READ_METRIC_UPDATES() Update the input bytes read metric each time this number of records has been read. |
static scala.Option<org.apache.spark.rdd.HadoopRDD.SplitInfoReflections> | SPLIT_INFO_REFLECTIONS() |
Methods inherited from class org.apache.spark.rdd.RDD
aggregate, cache, cartesian, checkpointData, clearDependencies, coalesce, collect, collect, context, count, countApprox, countApproxDistinct, countApproxDistinct, countByValue, countByValueApprox, creationSite, dependencies, distinct, distinct, doubleRDDToDoubleRDDFunctions, filter, filterWith, first, firstParent, flatMap, flatMapWith, fold, foreach, foreachPartition, foreachWith, getCheckpointFile, getDependencies, getStorageLevel, glom, groupBy, groupBy, groupBy, id, intersection, intersection, intersection, isCheckpointed, isEmpty, iterator, keyBy, localCheckpoint, map, mapPartitions, mapPartitionsWithContext, mapPartitionsWithIndex, mapPartitionsWithSplit, mapWith, max, min, name, numericRDDToDoubleRDDFunctions, parent, partitioner, partitions, persist, pipe, pipe, pipe, preferredLocations, randomSplit, rddToAsyncRDDActions, rddToOrderedRDDFunctions, rddToPairRDDFunctions, rddToSequenceFileRDDFunctions, reduce, repartition, sample, saveAsObjectFile, saveAsTextFile, saveAsTextFile, scope, setName, sortBy, sparkContext, subtract, subtract, subtract, take, takeOrdered, takeSample, toArray, toDebugString, toJavaRDD, toLocalIterator, top, toString, treeAggregate, treeReduce, union, unpersist, zip, zipPartitions, zipPartitions, zipPartitions, zipPartitions, zipPartitions, zipPartitions, zipWithIndex, zipWithUniqueId
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, wait, wait, wait
Methods inherited from interface org.apache.spark.Logging
initializeIfNecessary, initializeLogging, isTraceEnabled, log_, log, logDebug, logDebug, logError, logError, logInfo, logInfo, logName, logTrace, logTrace, logWarning, logWarning
public HadoopRDD(SparkContext sc, Broadcast<org.apache.spark.util.SerializableConfiguration> broadcastedConf, scala.Option<scala.Function1<org.apache.hadoop.mapred.JobConf,scala.runtime.BoxedUnit>> initLocalJobConfFuncOpt, java.lang.Class<? extends org.apache.hadoop.mapred.InputFormat<K,V>> inputFormatClass, java.lang.Class<K> keyClass, java.lang.Class<V> valueClass, int minPartitions)
public HadoopRDD(SparkContext sc, org.apache.hadoop.mapred.JobConf conf, java.lang.Class<? extends org.apache.hadoop.mapred.InputFormat<K,V>> inputFormatClass, java.lang.Class<K> keyClass, java.lang.Class<V> valueClass, int minPartitions)
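For completeness, a sketch of calling this JobConf-based constructor directly; per the note at the top of this page, SparkContext.hadoopRDD() is the preferred way to obtain the RDD, and the input path here is a placeholder.

```scala
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapred.{FileInputFormat, JobConf, TextInputFormat}
import org.apache.spark.SparkContext
import org.apache.spark.rdd.HadoopRDD

// Direct construction mirrors the constructor parameters documented above.
def directHadoopRDD(sc: SparkContext): HadoopRDD[LongWritable, Text] = {
  val jobConf = new JobConf()
  FileInputFormat.setInputPaths(jobConf, "hdfs:///tmp/input")  // placeholder path
  new HadoopRDD[LongWritable, Text](
    sc,
    jobConf,
    classOf[TextInputFormat],
    classOf[LongWritable],
    classOf[Text],
    2 /* minPartitions */)
}
```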
public static java.lang.Object CONFIGURATION_INSTANTIATION_LOCK()
public static int RECORDS_BETWEEN_BYTES_READ_METRIC_UPDATES()
public static java.lang.Object getCachedMetadata(java.lang.String key)
Parameters: key - (undocumented)
public static boolean containsCachedMetadata(java.lang.String key)
public static void addLocalConfiguration(java.lang.String jobTrackerId, int jobId, int splitId, int attemptId, org.apache.hadoop.mapred.JobConf conf)
public static scala.Option<org.apache.spark.rdd.HadoopRDD.SplitInfoReflections> SPLIT_INFO_REFLECTIONS()
protected java.lang.String jobConfCacheKey()
protected java.lang.String inputFormatCacheKey()
protected org.apache.hadoop.mapred.JobConf getJobConf()
protected org.apache.hadoop.mapred.InputFormat<K,V> getInputFormat(org.apache.hadoop.mapred.JobConf conf)
public Partition[] getPartitions()
Description copied from class: RDD
Implemented by subclasses to return the set of partitions in this RDD.
Overrides: getPartitions in class RDD<scala.Tuple2<K,V>>
public InterruptibleIterator<scala.Tuple2<K,V>> compute(Partition theSplit, TaskContext context)
Description copied from class: RDD
:: DeveloperApi :: Implemented by subclasses to compute a given partition.
public <U> RDD<U> mapPartitionsWithInputSplit(scala.Function2<org.apache.hadoop.mapred.InputSplit,scala.collection.Iterator<scala.Tuple2<K,V>>,scala.collection.Iterator<U>> f, boolean preservesPartitioning, scala.reflect.ClassTag<U> evidence$1)
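A sketch of how this method might be used from Scala, assuming the RDD reads file-based input so that each InputSplit is an org.apache.hadoop.mapred.FileSplit; the cast would fail for non-file input formats, and `hadoopRdd` is assumed to be a HadoopRDD[LongWritable, Text] such as the one from the earlier sketch.

```scala
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapred.FileSplit
import org.apache.spark.rdd.{HadoopRDD, RDD}

// Tag each line with the path of the file its partition was read from.
def linesWithFileName(hadoopRdd: HadoopRDD[LongWritable, Text]): RDD[(String, String)] =
  hadoopRdd.mapPartitionsWithInputSplit({ (split, iter) =>
    // For file-based input formats each InputSplit is a FileSplit carrying the file path.
    val file = split.asInstanceOf[FileSplit].getPath.toString
    iter.map { case (_, text) => (file, text.toString) }
  }, preservesPartitioning = true)
```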
public scala.collection.Seq<java.lang.String> getPreferredLocations(Partition split)
Description copied from class: RDD
Optionally overridden by subclasses to specify placement preferences.
Overrides: getPreferredLocations in class RDD<scala.Tuple2<K,V>>
Parameters: split - (undocumented)
public void checkpoint()
Description copied from class: RDD
Mark this RDD for checkpointing. It will be saved to a file inside the checkpoint directory set with SparkContext#setCheckpointDir and all references to its parent RDDs will be removed. This function must be called before any job has been executed on this RDD. It is strongly recommended that this RDD is persisted in memory, otherwise saving it on a file will require recomputation.
Overrides: checkpoint in class RDD<scala.Tuple2<K,V>>
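A usage sketch combining checkpoint() with the in-memory persistence the description recommends; `sc` and `hadoopRdd` are assumed from the earlier sketches and the checkpoint directory is a placeholder.

```scala
import org.apache.spark.storage.StorageLevel

sc.setCheckpointDir("hdfs:///tmp/checkpoints")  // placeholder directory
hadoopRdd.persist(StorageLevel.MEMORY_ONLY)     // recommended so checkpointing does not recompute the RDD
hadoopRdd.checkpoint()                          // must be called before any job has executed on this RDD
hadoopRdd.count()                               // the first action materializes and writes the checkpoint
```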
public HadoopRDD<K,V> persist(StorageLevel storageLevel)
Description copied from class: RDD
Set this RDD's storage level to persist its values across operations after the first time it is computed.
public org.apache.hadoop.conf.Configuration getConf()
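A small sketch of inspecting the returned Hadoop Configuration; `hadoopRdd` is assumed from the earlier sketches, and fs.defaultFS is a standard Hadoop property chosen purely for illustration.

```scala
import org.apache.hadoop.conf.Configuration

// getConf exposes the Hadoop Configuration this RDD will use when reading its input.
val hadoopConf: Configuration = hadoopRdd.getConf
println(hadoopConf.get("fs.defaultFS"))
```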