Issue
What is the best way to implement the Apriori algorithm in pandas? So far I got stuck on transforming extracting out the patterns using for loops. Everything from the for loop onward does not work. Is there a vectorized way to do this in pandas?
import pandas as pd
import numpy as np
trans=pd.read_table('output.txt', header=None,index_col=0)
def apriori(trans, support=4):
ts=pd.get_dummies(trans.unstack().dropna()).groupby(level=1).sum()
#user input
collen, rowlen =ts.shape
#max length of items
tssum=ts.sum(axis=1)
maxlen=tssum.loc[tssum.idxmax()]
items=list(ts.columns)
results=[]
#loop through items
for c in range(1, maxlen):
#generate patterns
pattern=[]
for n in len(pattern):
#calculate support
pattern=['supp']=pattern.sum/rowlen
#filter by support level
Condit=pattern['supp']> support
pattern=pattern[Condit]
results.append(pattern)
return results
results =apriori(trans)
print results
When I insert this with support 3
a b c d e
0
11 1 1 1 0 0
666 1 0 0 1 1
10101 0 1 1 1 0
1010 1 1 1 1 0
414147 0 1 1 0 0
10101 1 1 0 1 0
1242 0 0 0 1 1
101 1 1 1 1 0
411 0 0 1 1 1
444 1 1 1 0 0
it should output something like
Pattern support
a 6
b 7
c 7
d 7
e 3
a,b 5
a,c 4
a,d 4
Solution
Assuming I understand what you're after, maybe
from itertools import combinations
def get_support(df):
pp = []
for cnum in range(1, len(df.columns)+1):
for cols in combinations(df, cnum):
s = df[list(cols)].all(axis=1).sum()
pp.append([",".join(cols), s])
sdf = pd.DataFrame(pp, columns=["Pattern", "Support"])
return sdf
would get you started:
>>> s = get_support(df)
>>> s[s.Support >= 3]
Pattern Support
0 a 6
1 b 7
2 c 7
3 d 7
4 e 3
5 a,b 5
6 a,c 4
7 a,d 4
9 b,c 6
10 b,d 4
12 c,d 4
14 d,e 3
15 a,b,c 4
16 a,b,d 3
21 b,c,d 3
[15 rows x 2 columns]
Answered By - DSM
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.