1 回答
TA贡献1797条经验 获得超4个赞
我不确定我是否理解了逻辑,看看这是否有帮助。
对于 100000 个三元组,运行大约需要 41 秒。
loc 和 get_loc 是开销非常大的操作,所以应先把表格放进字典里;另外,不必逐一验证所有内容是否唯一,直接把它们放进一个集合(set)即可。
import pandas as pd
import random
from collections import defaultdict as dd
from collections import Counter
import time
# Build the synthetic input table: 100000 unique, unordered trios drawn from
# 50000 ids, flattened into one row per id.
ids = list(range(50000))
trios_set = set()
while len(trios_set) < 100000:
    # A frozenset is hashable and order-insensitive, so the enclosing set
    # silently rejects a trio that has already been drawn.
    trios_set.add(frozenset(random.sample(ids, 3)))

# Map each id to the list of ids it shares a trio with. Partners are appended
# once per shared trio, so the list is not de-duplicated.
ids_dict = dd(list)
for members in trios_set:
    for member in members:
        ids_dict[member].extend(
            partner for partner in members if partner != member
        )
ids_dict = dict(ids_dict)  # freeze: later lookups must not create new keys

type_list = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n"]

# One record per id. Key order matters ("id", then "id_1".."id_k", then
# "type") because it determines the column order of the resulting DataFrame.
for_df = []
for atom_id, partners in ids_dict.items():
    row = {"id": atom_id}
    for position, partner in enumerate(partners, start=1):
        row["id_" + str(position)] = partner
    row["type"] = random.choice(type_list)
    for_df.append(row)

# Table with an "id" column, the ids sharing a trio with it in "id_1",
# "id_2", ... and a "type" column holding a random letter.
df = pd.DataFrame(for_df)
# ------------------------------------------------------------------
# Everything above only builds the input table `df`; the timed work starts
# here.
start_time = time.time()

# Move the DataFrame into plain Python containers once, so the loops below
# never touch pandas indexing:
#   type_dict: id -> type letter
#   from_df:   id -> set of ids listed in that row's "id_*" columns
partner_columns = [col for col in df if "id_" in col]
partner_rows = df[partner_columns].itertuples(index=False, name=None)
type_dict = {}
from_df = dd(set)
for atom_id, atom_kind, partners in zip(df["id"], df["type"], partner_rows):
    type_dict[atom_id] = atom_kind
    # Empty partner cells are NaN; skip them. Only touch from_df when there
    # is something to add so ids without partners get no entry.
    present = [partner for partner in partners if pd.notna(partner)]
    if present:
        from_df[atom_id].update(present)
from_df = dict(from_df)

# Collect every set of three distinct ids reachable as id -> partner ->
# partner-of-partner. frozenset makes the trio order-insensitive, so each
# combination is stored once.
# NOTE(review): third_id is taken from other_id's partners and is not checked
# against atom_id's partners, so this yields chains through other_id, not
# only mutually connected trios - confirm this is the intended definition.
out_trio_set = set()
for atom_id, partners in from_df.items():
    for other_id in partners:
        if other_id == atom_id:
            continue
        for third_id in from_df[other_id]:
            current_trio = frozenset((atom_id, other_id, third_id))
            if len(current_trio) == 3:
                out_trio_set.add(current_trio)

# Count trios by their sorted, concatenated type letters (e.g. "abc"), so the
# key does not depend on the order of ids inside a trio.
type_conter = Counter(
    "".join(sorted(type_dict[member] for member in trio))
    for trio in out_trio_set
)

# Single-row table: one column per type combination. The index value 1 is a
# placeholder; put a timestamp here if one is needed.
out_df = pd.DataFrame(type_conter, index=[1])
out_df.to_excel(r"D:\atom.xlsx")
print("--- %s seconds ---" % (time.time() - start_time))
添加回答
举报