Pandas 目前不支持在 merge 语法中使用不等式连接;一种替代方案是使用 pyjanitor 的 conditional_join 函数——我是这个库的贡献者:
# pip install pyjanitor
import pandas as pd
import janitor  # registers the conditional_join accessor on DataFrame
# non-equi join: pair every left row with the right rows where left.value > right.value
left.conditional_join(right, ('value', 'value', '>'))
left right
key value key value
0 A 1.764052 D -0.977278
1 A 1.764052 F -0.151357
2 A 1.764052 E 0.950088
3 B 0.400157 D -0.977278
4 B 0.400157 F -0.151357
5 C 0.978738 D -0.977278
6 C 0.978738 F -0.151357
7 C 0.978738 E 0.950088
8 D 2.240893 D -0.977278
9 D 2.240893 F -0.151357
10 D 2.240893 E 0.950088
11 D 2.240893 B 1.867558
# same join, opposite direction: keep pairs where left.value < right.value
left.conditional_join(right, ('value', 'value', '<'))
left right
key value key value
0 A 1.764052 B 1.867558
1 B 0.400157 E 0.950088
2 B 0.400157 B 1.867558
3 C 0.978738 B 1.867558
列以可变参数的元组形式传递,每个元组由来自左侧数据帧的列、来自右侧数据帧的列和连接运算符组成,连接运算符可以是(>,<,>=,<=,!=)中的任何一个。在上面的示例中,由于列名重叠,返回了一个 MultiIndex 列。
就性能而言,这比天真的交叉连接要好:
# reproducible sample frames for the cross-join vs conditional_join benchmark
# (seed 0 so the head() outputs shown below are reproducible)
np.random.seed(0)
dd = pd.DataFrame({'value':np.random.randint(100000, size=50_000)})
df = pd.DataFrame({'start':np.random.randint(100000, size=1_000),
'end':np.random.randint(100000, size=1_000)})
dd.head()
value
0 68268
1 43567
2 42613
3 45891
4 21243
df.head()
start end
0 71915 47005
1 64284 44913
2 13377 96626
3 75823 38673
4 29151 575
%%timeit
out = df.merge(dd, how='cross')
out.loc[(out.start < out.value) & (out.end > out.value)]
5.12 s ± 19 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit df.conditional_join(dd, ('start', 'value' ,'<'), ('end', 'value' ,'>'))
280 ms ± 5.56 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit df.conditional_join(dd, ('start', 'value' ,'<'), ('end', 'value' ,'>'), use_numba=True)
124 ms ± 12.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# sanity check: the filtered cross join and conditional_join agree row-for-row
out = df.merge(dd, how='cross')
out = out.loc[(out.start < out.value) & (out.end > out.value)]
A = df.conditional_join(dd, ('start', 'value' ,'<'), ('end', 'value' ,'>'))
# sort both frames by the same columns so row ordering cannot mask (in)equality
columns = A.columns.tolist()
A = A.sort_values(columns, ignore_index = True)
out = out.sort_values(columns, ignore_index = True)
A.equals(out)
True
根据数据大小,当存在等值连接时,可以获得更高的性能。在这种情况下,内部会使用 pandas 的 merge 函数,但最终数据帧会延迟到非等值连接计算完成后才生成。当存在等值连接条件时,不支持 numba。让我们看看这里的数据:
import pandas as pd
import numpy as np
import random
import datetime
def random_dt_bw(start_date,end_date):
    """Return a uniformly random date in the half-open range [start_date, end_date)."""
    span_days = (end_date - start_date).days
    offset = random.randrange(span_days)
    return start_date + datetime.timedelta(days=offset)

def generate_data(n=1000):
    """Create a synthetic (offerDf, transactionDf) pair for the join benchmark.

    offerDf       : n rows   -- Item, StartDt, EndDt (offer window of 1-10 days).
    transactionDf : 5*n rows -- Item, TransactionDt, Sales.

    NOTE(review): transaction items are drawn with randint(0, n) *inclusive*,
    so "i_{n}" can appear in transactions with no matching offer row --
    presumably intended (transactions outside any offer); confirm.
    """
    window_start = datetime.date(2020, 1, 1)
    window_end = datetime.date(2020, 9, 1)
    # Offers: one row per distinct item.
    offer_items = [f"i_{idx}" for idx in range(n)]
    offer_starts = [random_dt_bw(window_start, window_end) for _ in range(n)]
    offer_ends = [start + datetime.timedelta(days=random.randint(1, 10))
                  for start in offer_starts]
    offerDf = pd.DataFrame({"Item": offer_items,
                            "StartDt": offer_starts,
                            "EndDt": offer_ends})
    # Transactions: five per offer row on average; same random-call order as
    # the original (items, then dates, then sales), so seeded runs match.
    n_tx = 5 * n
    transactionDf = pd.DataFrame({"Item": [f"i_{random.randint(0, n)}" for _ in range(n_tx)],
                                  "TransactionDt": [random_dt_bw(window_start, window_end) for _ in range(n_tx)],
                                  "Sales": [random.randint(0, 1000) for _ in range(n_tx)]})
    return offerDf, transactionDf
# build a large sample; convert the date columns to datetime64 dtype
# NOTE(review): Series.astype(np.datetime64) is deprecated in recent
# pandas/numpy -- pd.to_datetime is the usual replacement; confirm versions
offerDf,transactionDf = generate_data(n=100000)
offerDf = (offerDf
.assign(StartDt = offerDf.StartDt.astype(np.datetime64),
EndDt = offerDf.EndDt.astype(np.datetime64)
)
)
transactionDf = transactionDf.assign(TransactionDt = transactionDf.TransactionDt.astype(np.datetime64))
# you can get more performance when using ints/datetimes
# in the equi join, compared to strings
# Itemr: numeric form of the Item key ("i_123" -> 123)
offerDf = offerDf.assign(Itemr = offerDf.Item.str[2:].astype(int))
transactionDf = transactionDf.assign(Itemr = transactionDf.Item.str[2:].astype(int))
transactionDf.head()
Item TransactionDt Sales Itemr
0 i_43407 2020-05-29 692 43407
1 i_95044 2020-07-22 964 95044
2 i_94560 2020-01-09 462 94560
3 i_11246 2020-02-26 690 11246
4 i_55974 2020-03-07 219 55974
offerDf.head()
Item StartDt EndDt Itemr
0 i_0 2020-04-18 2020-04-19 0
1 i_1 2020-02-28 2020-03-07 1
2 i_2 2020-03-28 2020-03-30 2
3 i_3 2020-08-03 2020-08-13 3
4 i_4 2020-05-26 2020-06-04 4
# merge on the integer key (Itemr), then filter to the offer date window
# NOTE(review): the original comments on these two merges were swapped --
# this merge uses the integer Itemr key, the next one the string Item key
merged_df = pd.merge(offerDf,transactionDf,on='Itemr')
classic_int = merged_df[(merged_df['TransactionDt']>=merged_df['StartDt']) &
(merged_df['TransactionDt']<=merged_df['EndDt'])]
# merge on the string key (Item) ... usually slower than the integer key
merged_df = pd.merge(offerDf,transactionDf,on='Item')
classic_str = merged_df[(merged_df['TransactionDt']>=merged_df['StartDt']) &
(merged_df['TransactionDt']<=merged_df['EndDt'])]
# conditional_join: equi-join on the integer key plus the two non-equi
# date conditions, all resolved in a single call
cond_join_int = (transactionDf
.conditional_join(
offerDf,
('Itemr', 'Itemr', '=='),
('TransactionDt', 'StartDt', '>='),
('TransactionDt', 'EndDt', '<=')
)
)
# same join, but the equi condition uses the string key (slower per the
# timings below)
cond_join_str = (transactionDf
.conditional_join(
offerDf,
('Item', 'Item', '=='),
('TransactionDt', 'StartDt', '>='),
('TransactionDt', 'EndDt', '<=')
)
)
%%timeit
merged_df = pd.merge(offerDf,transactionDf,on='Item')
classic_str = merged_df[(merged_df['TransactionDt']>=merged_df['StartDt']) &
(merged_df['TransactionDt']<=merged_df['EndDt'])]
292 ms ± 3.84 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%%timeit
merged_df = pd.merge(offerDf,transactionDf,on='Itemr')
classic_int = merged_df[(merged_df['TransactionDt']>=merged_df['StartDt']) &
(merged_df['TransactionDt']<=merged_df['EndDt'])]
253 ms ± 2.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%%timeit
(transactionDf
.conditional_join(
offerDf,
('Item', 'Item', '=='),
('TransactionDt', 'StartDt', '>='),
('TransactionDt', 'EndDt', '<=')
)
)
256 ms ± 9.66 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%%timeit
(transactionDf
.conditional_join(
offerDf,
('Itemr', 'Itemr', '=='),
('TransactionDt', 'StartDt', '>='),
('TransactionDt', 'EndDt', '<=')
)
)
71.8 ms ± 2.24 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
# check that both dataframes are equal
cols = ['Item', 'TransactionDt', 'Sales', 'Itemr_y','StartDt', 'EndDt', 'Itemr_x']
cond_join_str = cond_join_str.drop(columns=('right', 'Item')).set_axis(cols, axis=1)
(cond_join_str
.sort_values(cond_join_str.columns.tolist())
.reset_index(drop=True)
.reindex(columns=classic_str.columns)
.equals(
classic_str
.sort_values(classic_str.columns.tolist())
.reset_index(drop=True)
))
True