The following table shows which pandas APIs are implemented or not implemented in the pandas API on Spark. Some pandas APIs do not implement the full set of parameters, so the third column shows the missing parameters for each API.
‘Y’ in the second column means it’s implemented including all of its parameters.
‘N’ means it’s not implemented yet.
‘P’ means it’s partially implemented, with some parameters missing (see the short sketch below the introduction for an example of reading these markers).
All APIs in the list below compute the data with distributed execution, except the ones that require local execution by design. For example, DataFrame.to_numpy() requires collecting the data to the driver side.
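For instance, a minimal sketch of the difference between distributed and local execution, assuming a working pyspark.pandas session; the column names and values are only illustrative:

    import pyspark.pandas as ps

    # A small pandas-on-Spark DataFrame; the values are only illustrative.
    psdf = ps.DataFrame({"a": [1, -2, 3], "b": [-4, 5, -6]})

    # abs() computes with distributed execution and returns another
    # pandas-on-Spark DataFrame, so nothing is collected to the driver.
    abs_psdf = psdf.abs()

    # to_numpy() requires local execution by design: it collects the data
    # to the driver and returns a local numpy.ndarray.
    local_array = psdf.to_numpy()
    print(type(local_array))  # <class 'numpy.ndarray'>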
If there is a non-implemented pandas API or parameter you want, you can create an Apache Spark JIRA to request it or contribute it on your own.
The API list is updated based on the pandas 1.3 official API reference.
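For example, reading the first few rows of the table below: T is marked ‘Y’, while add() is marked ‘P’ with axis, level and fill_value listed as missing. A minimal sketch that stays within the supported surface (the data is made up for illustration):

    import pyspark.pandas as ps

    psdf = ps.DataFrame({"x": [1, -2, 3], "y": [4, 5, 6]})

    # T is marked 'Y': implemented including all of its parameters.
    transposed = psdf.T

    # add() is marked 'P': the basic element-wise form is supported, but axis,
    # level and fill_value are listed as missing, so they are not passed here.
    plus_ten = psdf.add(10)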
API
Implemented
Missing parameters
T()
Y
abs()
add()
P
axis, level, fill_value
add_prefix()
add_suffix()
agg()
aggregate()
align()
fill_value, method, limit, fill_axis
all()
skipna, level, bool_only
any()
append()
apply()
raw, result_type
applymap()
na_action
asfreq
N
asof
assign()
astype()
copy, errors
at()
at_time()
attrs
axes()
backfill()
between_time()
inclusive
bfill()
bool()
boxplot
clip()
axis, inplace
columns()
combine
combine_first
compare
convert_dtypes
copy()
corr()
min_periods
corrwith
count()
cov()
ddof
cummax()
cummin()
cumprod()
cumsum()
describe()
include, exclude, datetime_is_numeric
diff()
div()
divide()
dot()
drop()
index, level, inplace, errors
drop_duplicates()
ignore_index
droplevel()
dropna()
dtypes()
duplicated()
empty()
eq()
axis, level
equals()
eval()
ewm()
expanding()
explode()
ffill()
fillna()
downcast
filter()
first()
first_valid_index()
flags
floordiv()
from_dict()
from_records()
ge()
get()
groupby()
gt()
head()
hist()
iat()
idxmax()
idxmin()
iloc()
index()
infer_objects
info()
show_counts
insert()
interpolate()
isin()
isna()
isnull()
items()
iteritems()
iterrows()
itertuples()
join()
sort
keys()
kurt()
skipna, level
kurtosis()
last()
last_valid_index()
le()
loc()
lookup
lt()
mad()
mask()
inplace, axis, level, errors
max()
mean()
median()
melt()
col_level, ignore_index
memory_usage
merge()
sort, copy, indicator, validate
min()
mod()
mode
mul()
multiply()
ndim()
ne()
nlargest()
keep
notna()
notnull()
nsmallest()
nunique()
pad()
pct_change()
fill_method, limit, freq
pipe()
pivot()
pivot_table()
plot.area()
plot.bar()
plot.barh()
plot.box()
plot.density()
plot.hexbin()
plot.hist()
plot.kde()
plot.line()
plot.pie()
plot.scatter()
pop()
pow()
prod()
product()
quantile()
interpolation
query()
radd()
rank()
axis, na_option, pct, numeric_only
rdiv()
reindex()
method, level, limit, tolerance
reindex_like()
method, limit, tolerance
rename()
rename_axis()
reorder_levels
replace()
regex, method
resample
reset_index()
rfloordiv()
rmod()
rmul()
rolling()
round()
rpow()
rsub()
rtruediv()
sample()
weights, axis, ignore_index
select_dtypes()
sem()
set_axis
set_flags
set_index()
verify_integrity
shape()
shift()
freq, axis
size()
skew()
slice_shift
sort_index()
sort_remaining, ignore_index, key
sort_values()
sparse
squeeze()
stack()
level, dropna
std()
style()
sub()
subtract()
sum()
swapaxes()
swaplevel()
tail()
take()
to_clipboard()
to_csv()
encoding, compression, quoting, line_terminator, chunksize and more. See the pandas.DataFrame.to_csv and pyspark.pandas.DataFrame.to_csv for details.
to_dict()
to_excel()
storage_options
to_feather
to_gbq
to_hdf
to_html()
to_json()
date_format, double_precision, force_ascii, date_unit, default_handler and more. See the pandas.DataFrame.to_json and pyspark.pandas.DataFrame.to_json for details.
to_latex()
caption, label, position
to_markdown()
to_numpy()
to_parquet()
engine, storage_options
to_period
to_pickle
to_records()
to_sql
to_stata
to_string()
to_timestamp
to_xarray
to_xml
transform()
transpose()
truediv()
truncate()
tshift
tz_convert
tz_localize
unstack()
level, fill_value
update()
filter_func, errors
value_counts
values()
var()
where()
inplace, level, errors
xs()
drop_level
read_pickle
DataFrame.to_pickle
read_table()
read_csv()
converters, true_values, false_values, skipinitialspace, skiprows and more. See the pandas.read_csv and pyspark.pandas.read_csv for details.
DataFrame.to_csv()
read_fwf
read_clipboard()
DataFrame.to_clipboard()
read_excel()
skiprows, na_filter, decimal, skipfooter, storage_options
DataFrame.to_excel()
read_json()
orient, typ, dtype, convert_axes, convert_dates and more. See the pandas.read_json and pyspark.pandas.read_json for details.
DataFrame.to_json()
date_format, double_precision, force_ascii, date_unit, default_handler and more. See the pandas.DataFrame.to_json and pyspark.pandas.DataFrame.to_json for details.
read_html()
DataFrame.to_html()
read_xml
DataFrame.to_xml
DataFrame.to_latex()
read_hdf
read_feather
DataFrame.to_feather
read_parquet()
engine, storage_options, use_nullable_dtypes
DataFrame.to_parquet()
read_orc()
read_sas
read_spss
read_sql_table()
coerce_float, parse_dates, chunksize
read_sql_query()
coerce_float, params, parse_dates, chunksize, dtype
read_sql()
coerce_float, params, parse_dates, chunksize
DataFrame.to_sql
read_gbq
read_stata
DataFrame.to_stata
pivot
pivot_table
crosstab
cut
qcut
copy, indicator, validate
merge_ordered
merge_asof()
concat()
keys, levels, names, verify_integrity, copy
get_dummies()
factorize
unique
wide_to_long
to_numeric()
errors, downcast
to_datetime()
dayfirst, yearfirst, utc, exact
date_range()
bdate_range
period_range
timedelta_range()
infer_freq
interval_range
eval
level, fill_value, method, limit, fill_axis
bool_only, skipna, level
convert_dtype
argmax()
axis, skipna
argmin()
argsort()
axis, kind, order
array
asof()
subset
autocorr
between()
cat()
combine_first()
compare()
align_axis
fill_value, level
divmod()
columns, inplace, errors
dt()
dtype()
factorize()
hasnans()
is_monotonic()
is_monotonic_decreasing()
Y
is_monotonic_increasing()
Y
is_unique()
item()
axis, skipna, level
map()
mode()
name()
nbytes
axis, na_option, pct, numeric_only
ravel
rdivmod()
method, copy, level, limit, tolerance
method, copy, limit, tolerance
axis, copy, inplace
repeat()
inplace, limit, regex, method
weights, axis
searchsorted
axis, kind, key, ignore_index
str()
encoding, compression, quoting, line_terminator, chunksize and more. See the pandas.Series.to_csv and pyspark.pandas.Series.to_csv for details.
to_frame()
date_format, double_precision, force_ascii, default_handler, storage_options
to_list()
min_rows
tolist
unique()
value_counts()
view
axis, drop_level
argsort
asi8()
asof_locs
delete()
difference()
how
duplicated
format
get_indexer
get_indexer_for
get_indexer_non_unique
get_level_values
get_loc
get_slice_bound
get_value
groupby
has_duplicates()
holds_integer
identical()
inferred_type()
intersection()
is_
is_all_dates()
is_boolean()
is_categorical()
is_floating()
is_integer()
is_interval()
is_mixed
is_numeric()
is_object()
is_type_compatible()
isnull
join
names()
nlevels()
putmask
reindex
set_names()
set_value
slice_indexer
slice_locs
na_position, key, return_indexer
sortlevel
symmetric_difference()
axis, allow_fill, fill_value
to_flat_index
to_native_types
to_series()
transpose
union()
view()
where
Rolling.agg
Rolling.aggregate
Rolling.apply
Rolling.axis
Rolling.center
Rolling.closed
Rolling.corr
Rolling.count()
Rolling.cov
Rolling.exclusions
Rolling.is_datetimelike
Rolling.kurt
Rolling.max()
Rolling.mean()
Rolling.median
Rolling.method
Rolling.min()
Rolling.min_periods
Rolling.ndim
Rolling.obj
Rolling.on
Rolling.quantile
Rolling.sem
Rolling.skew
Rolling.std
Rolling.sum()
Rolling.validate
Rolling.var
Rolling.win_type
Rolling.window
Expanding.agg
Expanding.aggregate
Expanding.apply
Expanding.axis
Expanding.center
Expanding.closed
Expanding.corr
Expanding.count()
Expanding.cov
Expanding.exclusions
Expanding.is_datetimelike
Expanding.kurt
Expanding.max()
Expanding.mean()
Expanding.median
Expanding.method
Expanding.min()
Expanding.min_periods
Expanding.ndim
Expanding.obj
Expanding.on
Expanding.quantile
Expanding.sem
Expanding.skew
Expanding.std
Expanding.sum()
Expanding.validate
Expanding.var
Expanding.win_type
Expanding.window
DataFrameGroupBy.agg()
DataFrameGroupBy.aggregate()
GroupBy.all()
GroupBy.any()
GroupBy.apply()
GroupBy.backfill()
GroupBy.bfill()
corr
GroupBy.count()
cov
GroupBy.cumcount()
GroupBy.cummax()
GroupBy.cummin()
GroupBy.cumprod()
GroupBy.cumsum()
DataFrameGroupBy.describe()
percentiles, include, exclude, datetime_is_numeric
GroupBy.diff()
dtypes
ewm
GroupBy.ffill()
GroupBy.fillna()
GroupBy.filter()
GroupBy.first()
numeric_only, min_count
GroupBy.get_group()
groups
GroupBy.head()
hist
GroupBy.idxmax()
GroupBy.idxmin()
indices
GroupBy.last()
mad
GroupBy.max()
GroupBy.mean()
numeric_only, engine
GroupBy.median()
GroupBy.min()
ndim
ngroup
ngroups
nth
GroupBy.nunique()
ohlc
pad
pct_change
pipe
plot
prod
quantile
GroupBy.rank()
na_option, pct, axis
sample
sem
GroupBy.shift()
GroupBy.size()
skew
GroupBy.std()
GroupBy.sum()
GroupBy.tail()
take
GroupBy.transform()
GroupBy.var()