currently I have a piece of code that spends most of its time on the above two sentences of data filtering in dataframe.
temp_df = df [df ["data_date"] .isin (date_list)]
temp = temp_df [rule [2]] [temp_df ["data_date"] = = d]
at present, it takes about 30-50 seconds to execute the filter rule with only four conditions. Is there any good optimization method?
because more than 10 or more rule may be added later for comprehensive filtering.
attached code and performance data
Function: digg_data_byrule at line 56
Line -sharp Hits Time Per Hit % Time Line Contents
==============================================================
56 def digg_data_byrule(df,rules):
57 1546 1398.0 0.9 0.0 info=""
58 7730 6465.0 0.8 0.0 for rule in rules:
59 6184 2076181.0 335.7 9.4 date_list=utils.datelist(rule[0],rule[1])
60 6184 9205.0 1.5 0.0 days = len(date_list)
61 6184 4966183.0 803.1 22.6 temp_df = df[df["data_date"].isin(date_list)]
62 6184 7782.0 1.3 0.0 flag = False
63 6184 20854.0 3.4 0.1 if len(temp_df) == days:
64 3100 2613.0 0.8 0.0 temp_array = []
65 16960 13525.0 0.8 0.1 for d in date_list:
66 13860 14703156.0 1060.8 66.9 temp = temp_df[rule[2]][temp_df["data_date"] == d]
67 13860 72778.0 5.3 0.3 if len(temp) > 0:
68 13860 62733.0 4.5 0.3 temp_array.append(temp.values[0])
69 7102 8265.0 1.2 0.0 for i in range(len(temp_array)):
70 7048 4693.0 0.7 0.0 if i > 0:
71 3948 9257.0 2.3 0.0 if (temp_array[i] - temp_array[i - 1]) / temp_array[i - 1] > rule[4]:
72 902 608.0 0.7 0.0 flag = True
73 else:
74 3046 1805.0 0.6 0.0 flag = False
75 3046 2206.0 0.7 0.0 break
76 6184 4376.0 0.7 0.0 if flag==True:
77 54 181.0 3.4 0.0 info += "%s:%s:%d:%.2f;" % (rule[3], rule[2], len(date_list), rule[4])
78 1546 874.0 0.6 0.0 return info
Total time: 40.1706 s
File: test.py
Function: main_test at line 16
def digg_data2(df,rules):
temp_df= df.groupby("product_id").apply(digg_data_byrule,rules)
temp_df=temp_df[temp_df!=""].reset_index()
temp_df.rename(columns={0:"flag"},inplace=True)
temp_df=temp_df.set_index("product_id")
return temp_df.to_dict("index")
def digg_data_byrule(df,rules):
info=""
for rule in rules:
date_list=utils.datelist(rule[0],rule[1])
days = len(date_list)
temp_df = df[df["data_date"].isin(date_list)]
flag = False
if len(temp_df) == days:
temp_array = []
for d in date_list:
temp = temp_df[rule[2]][temp_df["data_date"] == d]
if len(temp) > 0:
temp_array.append(temp.values[0])
for i in range(len(temp_array)):
if i > 0:
if (temp_array[i] - temp_array[i - 1]) / temp_array[i - 1] > rule[4]:
flag = True
else:
flag = False
break
if flag==True:
info += "%s:%s:%d:%.2f;" % (rule[3], rule[2], len(date_list), rule[4])
return info
rules = [
(max_date - timedelta(days=5), max_date, "product_add_cart", "increase_percent", 0.25),
(max_date - timedelta(days=5), max_date, "amount", "increase_percent", 0.25),
(max_date - timedelta(days=2), max_date, "product_add_cart", "increase_percent", 0.15),
(max_date - timedelta(days=2), max_date, "amount", "increase_percent", 0.15),
]
data = pd.read_sql(sql, db.engine)
data = data.groupby(["data_date", "product_id"]).sum().reset_index()
temp1 = digg_data2(data, rules)