import pandas as pd
import warnings
import wrapt
verb_registry = {}
property_registry = {}
dppd_types = set([None]) # which types are handled by dppd, others drop out of the pipe
[docs]class register_verb:
"""Register a function to act as a Dppd verb.
First parameter of the function must be the DataFrame being worked on.
Note that for grouped Dppds, the function get's called once per group.
Example::
register_verb('upper_case_all_columns')(
lambda df: df.assign(**{
name: df[name].str.upper() for name in df.columns})
"""
def __init__(self, name=None, types=None, pass_dppd=False):
"""
Parameters:
-----------
names : str, list or None
May be omitted, then it get's set to the functions __name__.
If it's a list, register aliases right away
Must be a valid identifer.
types : type or [type, type,...]
this verb only applies to these types
pass_dppd:
this func will get dppd instead of dppd.df (e.g. for dir)
"""
self.names = name
if not isinstance(types, list):
types = [types]
self.types = types
for t in types:
dppd_types.add(t)
if t not in property_registry:
property_registry[t] = set()
self.pass_dppd = pass_dppd
def __call__(self, func):
if self.names is None:
real_names = [func.__name__]
else:
if not isinstance(self.names, list):
real_names = [self.names]
else:
real_names = self.names
def outer(dppd):
def inner(*args, **kwargs):
if self.pass_dppd:
result = func(dppd, *args, **kwargs)
else:
result = func(dppd.df, *args, **kwargs)
# no verbs:
if type(result) in dppd_types:
return dppd._descend(result)
else:
return result
return inner
for real_name in real_names:
if not real_name.isidentifier():
raise TypeError(
"name passed to register_verb must be a valid python identifier"
)
for t in self.types:
if (real_name, t) in verb_registry and verb_registry[
(real_name, t)
] != func:
warnings.warn(f"redefining verb {real_name} for type {t}")
if t in property_registry and real_name in property_registry[t]:
warnings.warn(f"verb {real_name} shadows property for type {t}")
outer.__doc__ == func.__doc__
for t in self.types:
verb_registry[real_name, t] = outer
return func
[docs]def register_property(name, types=None):
"""Register a property/indexed accessor to be forwarded (.something[])"""
if not isinstance(types, list):
types = [types]
for t in types:
if (name, t) in verb_registry:
warnings.warn("Property always shadowed by verb: %s" % name)
if t not in property_registry:
property_registry[t] = set()
property_registry[t].add(name)
dppd_types.add(t)
[docs]def register_type_methods_as_verbs(cls, excluded):
for df_method in dir(cls):
if df_method not in excluded:
if not df_method.startswith("_"):
try:
attr = getattr(cls, df_method)
if hasattr(attr, "__call__"):
register_verb(df_method, types=cls)(attr)
else:
register_property(df_method, types=cls)
except AttributeError as e: # pragma: no cover
# this happens in pandas < 0.23 for DataFrame.columns
if "'NoneType' object has no attribute '_data'" in str(e):
register_property(df_method, types=cls)
else:
# just a defensive measure
raise # pragma: no cover
[docs]class Dppd:
"""
Dataframe maniPulater maniPulates Dataframes
A DataFrame manipulation object, offering verbs,
and each verb returns another Dppd.
All pandas.DataFrame methods have been turned into verbs.
Accessors like loc also work.
"""
def __init__(self, df, dppd_proxy, X, parent):
if isinstance(df, wrapt.ObjectProxy):
df = df._get_wrapped()
elif isinstance(df, Dppd):
df = df.df
if df is not None and type(df) not in dppd_types:
raise ValueError(
f"Dppd was passed a {type(df)} for which no properties have "
"been registered. That sounds like a bug."
)
self.df = df
self._dppd_proxy = dppd_proxy
self.X = X # the StackAwareDataframe proxy
dppd_proxy._self_update_wrapped(self)
self.X._self_update_wrapped(self.df)
self.parent = parent
def _descend(self, new_df, parent=None):
if new_df is None:
raise ValueError()
return Dppd(
new_df,
self._dppd_proxy,
self.X,
parent if parent is not None else self.parent,
)
@property
def pd(self):
"""Return the actual, unproxyied DataFrame"""
result = self.df
if self.parent is not None:
self._dppd_proxy._self_update_wrapped(self.parent)
self.X._self_update_wrapped(self.parent.df)
return result
def __call__(self, df=None):
if df is None:
if self.df is None:
raise ValueError("You have to call dp(df) before calling dp()")
return self
else:
last = self._dppd_proxy._get_wrapped()
return self._descend(df, parent=last)
def __getattr__(self, attr):
if attr == "__qualname__": # pragma: no cover
raise AttributeError(
"%s object has no attribute '__qualname__'" % (type(self))
)
if self.df is None:
raise ValueError("Dppd not initialized with a DataFrame")
if (attr, type(self.df)) in verb_registry:
return verb_registry[attr, type(self.df)](self)
elif (attr, None) in verb_registry:
return verb_registry[attr, None](self)
elif attr in property_registry[type(self.df)]:
return GetItemProxy(getattr(self.df, attr), self)
# if attr in property_registry[None]:
# return GetItemProxy(getattr(self.df, attr), self)
else:
raise AttributeError(attr, type(self.df))
def __getitem__(self, slice):
return self._descend(self.df[slice])
def __dir__(self):
result = set()
my_typ = type(self.df)
for name, typ in verb_registry.keys():
if typ is None or typ is my_typ:
result.add(name)
for name in property_registry[my_typ]:
result.add(name)
return sorted(result)
class ReplacableProxy(wrapt.ObjectProxy):
"""A proxy that can change what it proxies for
:autodoc_skip:
"""
def _self_update_wrapped(self, w):
self.__wrapped__ = w
def _get_wrapped(self):
return self.__wrapped__
def __call__(self, *args, **kwargs):
return self.__wrapped__(*args, **kwargs)
@property
def pd(self):
res = self._get_wrapped()
if isinstance(res, Dppd):
return res.pd
else:
return res
class DPPDAwareProxy(ReplacableProxy):
"""A replacable DataFrame proxy that also offers itergroups
:autodoc_skip:
"""
def __init__(self, wrapped, dppd_proxy):
self._self_dppd_proxy = dppd_proxy
super().__init__(wrapped)
def itergroups(self):
yield from self._self_dppd_proxy.itergroups()
class GetItemProxy(wrapt.ObjectProxy):
"""helper for accessor properties on DataFrames
:autodoc_skip:
"""
def __init__(self, wrapped, dppd):
self._self_dppd = dppd
super().__init__(wrapped)
def __getitem__(self, slice):
result = self.__wrapped__[slice]
if isinstance(result, pd.DataFrame) or isinstance(result, pd.Series):
return self._self_dppd._descend(result)
else:
return result
@property
def pd(self):
return self.__wrapped__
[docs]class dppd:
"""Context manager for Dppd.
Usage::
```
with cdp(mtcars) as (dp, X):
dp.groupby('cyl')
dp.arrange(X.hp))
dp.head(1)
print(X)
```
Both X and dp are a proxyied DataFrame after the context manager.
They should work just like a DataFrame,
use X.pd() to convert it into a true DataFrame.
Alternate usage::
dp, X = dppd()
dp(df).mutate(y=X['column'] * 2, ...).filter(...).select(...).pd
or::
dp(df).mutate(...)
dp.filter()
dp.select()
new_df = dp.pd
"""
def __init__(self, df=None):
self.df = df
# So that dp() is always the lastes
self.__dppd_proxy = ReplacableProxy(None)
# and X is always the latest DataFrame.
self.__X_proxy = DPPDAwareProxy(None, self.__dppd_proxy)
self.dppd = Dppd(self.df, self.__dppd_proxy, self.__X_proxy, None)
def __iter__(self):
"""Support to be able to say dp, X = dppd().
hacky, but does work
"""
# yield self.call_callback
yield self.__dppd_proxy
yield self.__X_proxy
def __enter__(self):
"""Context manager support"""
return (self.__dppd_proxy, self.__X_proxy)
def __exit__(self, _type, _value, _traceback):
# at midnight, both the carriage and the horses
# turn back into (wrapped) DataFrames
self.__X_proxy.__wrapped__ = self.__dppd_proxy.df
self.__dppd_proxy.__wrapped__ = self.__dppd_proxy.df
del self.df
all = [dppd, register_verb]