Issue
In the example below, I wish to return the last index occurrence, relative to the current row, at which the "lower" column was >= the "upper" column. I am able to do this with the expected results, but it is not truly vectorized and is very inefficient for larger dataframes.
import pandas as pd

# Build the sample frame: paired "lower"/"upper" values on a daily DatetimeIndex.
data = {'lower': [7, 1, 6, 1, 1, 1, 1, 11, 1, 1],
        'upper': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11]}
df = pd.DataFrame(data=data)
df['DATE'] = pd.date_range('2020-01-01', periods=len(data['lower']))
df['DATE'] = pd.to_datetime(df['DATE'])
df.set_index('DATE', inplace=True)


def get_most_recent_index(row):
    """Return the latest prior DATE whose "lower" is >= this row's "upper" (NaT if none)."""
    # Slice every row strictly before the current one (stepping back one
    # minute keeps the label slice exclusive of the current daily timestamp).
    earlier = df.loc[:row.name - pd.Timedelta(minutes=1)]
    # .max() over an empty DatetimeIndex yields NaT, which is the "no match" marker.
    return earlier[earlier['lower'] >= row['upper']].index.max()


# New column: most recent earlier index where that row's "lower" >= current "upper".
df['prev'] = df.apply(get_most_recent_index, axis=1)
print(df)
How would I rewrite this to be most efficient?
EDIT:
Firstly, thank you to you all for your responses.
On the topic of performance between the four viable solutions, the clear winner is bisect, proposed by Andrej Kesely. I have excluded pyjanitor as, with any data volume approaching my set, we quickly run into memory allocation errors.
baseline: 1min 35s ± 5.15 s per loop (mean ± std. dev. of 2 runs, 2 loops each)
bisect: 1.76 s ± 82.5 ms per loop (mean ± std. dev. of 2 runs, 2 loops each)
enumerate: 1min 13s ± 2.17 s per loop (mean ± std. dev. of 2 runs, 2 loops each)
import pandas as pd
import numpy as np
from bisect import bisect_left
import janitor
def get_sample_df(rows=100_000):
    """Build a reproducible benchmark frame.

    Integer 'lower'/'upper' columns (uniform on [1, 100), fixed seeds) on a
    minute-frequency DatetimeIndex named 'DATE' starting 2020-01-01.
    """
    # Two independently-seeded generators keep the columns reproducible.
    lower_vals = np.random.default_rng(seed=1).uniform(1, 100, rows)
    upper_vals = np.random.default_rng(seed=2).uniform(1, 100, rows)
    frame = pd.DataFrame({'lower': lower_vals, 'upper': upper_vals}).astype(int)
    frame['DATE'] = pd.to_datetime(
        pd.date_range('2020-01-01', periods=rows, freq="min"))
    return frame.set_index('DATE')
def get_baseline():
    """Reference (slow) candidate: per-row .apply with a label slice per row."""
    df = get_sample_df()

    def most_recent(row):
        # All rows strictly before the current timestamp (data is minute-spaced,
        # so stepping back one minute excludes the current row).
        before = df.loc[:row.name - pd.Timedelta(minutes=1)]
        # Latest index among earlier rows whose "lower" reaches this row's "upper".
        return before[before['lower'] >= row['upper']].index.max()

    # New column: most recent earlier index where "lower" >= current "upper".
    df['prev'] = df.apply(most_recent, axis=1)
    return df
def get_pyjanitor():
    """Candidate using pyjanitor's conditional_join (pairwise inequality join).

    NOTE(review): materialises every (prev, next) row pair that satisfies the
    join before reducing to the closest match, so memory grows with the number
    of matches — this is the variant that hit the 37.2 GiB allocation error.
    """
    df = get_sample_df()
    df.reset_index(inplace=True)
    # set the DATE column as an index
    # after the operation you can set the original DATE
    # column as an index
    left_df = df.assign(index_prev=df.index)
    right_df = df.assign(index_next=df.index)
    # Join every earlier row (index_prev < index_next) whose "lower" >= the
    # later row's "upper".
    out=(left_df
        .conditional_join(
            right_df,
            ('lower','upper','>='),
            ('index_prev','index_next','<'),
            df_columns='index_prev',
            right_columns=['index_next','lower','upper'])
        )
    # based on the matches, we may have multiple returns
    # what we need is the closest to the current row
    closest=out.index_next-out.index_prev
    grouper=[out.index_next, out.lower,out.upper]
    min_closest=closest.groupby(grouper).transform('min')
    closest=closest==min_closest
    # we have out matches, which is defined by `index_prev`
    # use index_prev to get the relevant DATE
    prev=out.loc[closest,'index_prev']
    prev=df.loc[prev,'DATE'].array # avoid index alignment here
    index_next=out.loc[closest,'index_next']
    # now assign back to df, based on index_next and prev
    prev=pd.Series(prev,index=index_next)
    df = df.assign(prev=prev)
    return df
def get_bisect():
    """Candidate using binary search over the sorted distinct "lower" values."""
    df = get_sample_df()

    def scan(lowers, uppers, dates):
        # Distinct lower values in ascending order; bisect narrows the
        # candidates to those that can satisfy >= upper.
        ordered = sorted(set(lowers))
        latest = {}  # lower value -> most recent date at which it appeared
        for low, up, when in zip(lowers, uppers, dates):
            start = bisect_left(ordered, up)  # first value >= up
            best = None
            for candidate in ordered[start:]:
                if candidate in latest:
                    stamp = latest[candidate]
                    if best is None or stamp > best:
                        best = stamp
            yield best
            # Record this row only after yielding: matches must be strictly earlier.
            latest[low] = when

    df["prev"] = list(scan(df["lower"], df["upper"], df.index))
    return df
def get_enumerate():
    """Candidate using a reversed linear scan over plain Python lists."""
    df = get_sample_df()
    df.reset_index(inplace=True)
    dates = df["DATE"].values.tolist()
    lowers = df["lower"].values.tolist()
    uppers = df["upper"].values.tolist()
    prev = []
    for i, up in enumerate(uppers):
        match = None
        # Walk backwards through strictly-earlier rows; the first hit is
        # the most recent qualifying date (None when nothing qualifies,
        # including for the very first row).
        for low, when in zip(reversed(lowers[:i]), reversed(dates[:i])):
            if low >= up:
                match = when
                break
        prev.append(match)
    df['prev'] = prev
    df['prev'] = pd.to_datetime(df['prev'])
    return df
# Benchmark driver — IPython/Jupyter only (%timeit is a line magic, not plain Python).
print("baseline:")
%timeit -n 2 -r 2 get_baseline()
# Unable to allocate 37.2 GiB for an array with shape (4994299505,) and data type int64
# print("pyjanitor:")
# %timeit -n 2 get_pyjanitor()
print("bisect:")
%timeit -n 2 -r 2 get_bisect()
print("enumerate:")
%timeit -n 2 -r 2 get_enumerate()
Solution
I'm not sure if this can be vectorized (because you have variables that are dependent on past state). But you can try to speed up the computation using binary search, e.g.:
from bisect import bisect_left
def get_prev(lower, upper, _date):
    """Yield, per row, the latest earlier date whose "lower" >= this row's "upper".

    Yields None when no earlier row qualifies. Per row: one O(log n) bisect
    plus a scan over the distinct lower values at or above the threshold,
    instead of re-slicing the whole frame.
    """
    ordered = sorted(set(lower))  # distinct lower values, ascending
    latest = {}                   # lower value -> most recent date it was seen
    for low, up, when in zip(lower, upper, _date):
        # First position whose lower value satisfies >= up.
        start = bisect_left(ordered, up)
        best = None
        for value in ordered[start:]:
            if value in latest:
                stamp = latest[value]
                if best is None or stamp > best:
                    best = stamp
        yield best
        # Record this row only after yielding: matches must be strictly earlier.
        latest[low] = when
# Materialise the generator into a new column; results match the .apply() baseline.
df["prev_new"] = list(get_prev(df["lower"], df["upper"], df.index))
print(df)
Prints:
lower upper prev prev_new
DATE
2020-01-01 7 2 NaT NaT
2020-01-02 1 3 2020-01-01 2020-01-01
2020-01-03 6 4 2020-01-01 2020-01-01
2020-01-04 1 5 2020-01-03 2020-01-03
2020-01-05 1 6 2020-01-03 2020-01-03
2020-01-06 1 7 2020-01-01 2020-01-01
2020-01-07 1 8 NaT NaT
2020-01-08 11 9 NaT NaT
2020-01-09 1 10 2020-01-08 2020-01-08
2020-01-10 1 11 2020-01-08 2020-01-08
Answered By - Andrej Kesely
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.