3 回答
TA贡献1898条经验 获得超8个赞
首先,找出所有以空格(" ")分隔、并且同时出现在输入列表中其他字符串里的子字符串。在此过程中构建一个字典:以这些子字符串作为键,以包含该子字符串的输入字符串列表作为值。这样得到的字典中,每个键都只是单个子字符串(单词)。对于示例输入,返回:
{'by': ['Garden by KatSkill', 'Meadow by KatSkill', 'House by KatSkill'], 'KatSkill': ['Garden by KatSkill', 'Meadow by KatSkill', 'House by KatSkill'], 'Doghouse': ['Doghouse Antwerp', 'Doghouse Vienna', 'Doghouse Amsterdam']}
为了获得预期的结果,还需要进行一次合并(压缩)。合并时可以利用这样一个事实:每个字典键同时也是其值列表中各个字符串的一部分。因此,遍历字典的值,把其中的每个字符串再次拆分为子字符串;然后按顺序遍历这些子字符串,确定由字典键组成的连续区间,并把该区间对应的文本作为键添加到新的字典中。对于 24k 条数据,这可能需要一段时间。请参阅下面的源代码:
# Sample input: names that share a leading word ("Doghouse ...") or a
# trailing phrase ("... by KatSkill").
mylist = [
    'Doghouse Amsterdam',
    'Doghouse Antwerp',
    'Doghouse Vienna',
    'House by KatSkill',
    'Garden by KatSkill',
    'Meadow by KatSkill',
]
def findSimilarSubstrings(list):
    """Map each word to the other input strings that contain it.

    Every string is split on single spaces. For each resulting word, every
    *different* string of the input that contains the word (plain substring
    containment) is recorded under that word.

    Args:
        list: iterable of strings that can be iterated more than once. The
            parameter name shadows the builtin; it is kept so existing
            callers keep working.

    Returns:
        dict mapping word -> list of containing strings, without duplicates,
        in first-seen order. Words found in no other string get no entry.
    """
    res_dict = {}
    for string in list:
        # Consecutive spaces make split(" ") yield empty tokens, and "" is
        # contained in every string; drop them so no bogus "" key appears.
        substrings = [word for word in string.split(" ") if word]
        for otherstring in list:
            # Prevent check with the same string
            if otherstring == string:
                continue
            for substring in substrings:
                if substring not in otherstring:
                    continue
                matches = res_dict.setdefault(substring, [])
                # Prevent duplicates
                if otherstring not in matches:
                    matches.append(otherstring)
    return res_dict
def _add_unique(groups, key, value):
    """Append *value* to the list stored under *key* unless already present."""
    members = groups.setdefault(key, [])
    if value not in members:
        members.append(value)


def findOverlappingLists(dict):
    """Merge word-level groups into groups keyed by runs of adjacent key words.

    Each string found in the values of the input is split on single spaces.
    Every maximal run of consecutive words that are keys of the input is
    joined with spaces, and the string is recorded under that joined phrase.

    Args:
        dict: mapping word -> list of strings. The parameter name shadows
            the builtin; it is kept so existing callers keep working.

    Returns:
        New dict mapping phrase -> list of strings containing that run,
        without duplicates, in first-seen order.
    """
    res_dict = {}
    for strings in dict.values():
        for string in strings:
            words = string.split(" ")
            run_start = None  # index where the current run of key words began
            for index, word in enumerate(words):
                if word in dict:
                    if run_start is None:
                        run_start = index
                elif run_start is not None:
                    # A non-key word closes the run that was in progress.
                    _add_unique(res_dict, " ".join(words[run_start:index]), string)
                    run_start = None
            # A run that reaches the end of the string is closed here.
            if run_start is not None:
                _add_unique(res_dict, " ".join(words[run_start:]), string)
    return res_dict
# First pass: collect every space-separated word that also occurs in another
# input string, mapped to the strings it occurs in. For mylist this returns:
# {'by': ['Garden by KatSkill', 'Meadow by KatSkill', 'House by KatSkill'],
# 'KatSkill': ['Garden by KatSkill', 'Meadow by KatSkill', 'House by KatSkill'],
# 'Doghouse': ['Doghouse Antwerp', 'Doghouse Vienna', 'Doghouse Amsterdam']}
similiarStrings = findSimilarSubstrings(mylist)
# Second pass: compact the word-level groups by looking each word of the
# grouped strings up in the dictionary's key set and merging adjacent key
# words into a single phrase key.
resultdict = findOverlappingLists(similiarStrings)
TA贡献1786条经验 获得超11个赞
这是一个可能更简单/更快的实现
from collections import Counter
from itertools import groupby
import pprint
# Strategy:
# 1. Find common words in strings in list
# 2. Group strings which have the same common words together
def find_common_words(lst):
    """Return the set of words that occur in more than one string of *lst*.

    Each string contributes a given word at most once, so a word repeated
    inside a single string is not mistaken for a shared one. Strings are
    split on any whitespace, the same tokenization ``grouping_key`` uses.
    """
    cnt = Counter()
    for s in lst:
        # Deduplicate per string: count strings containing the word,
        # not total occurrences.
        cnt.update(set(s.split()))
    # Keep words which appear in more than one string.
    return {word for word, n_strings in cnt.items() if n_strings > 1}
def grouping_key(s, words):
    """Build the grouping key for *s*.

    The key consists of the whitespace-separated words of *s* that are
    members of *words*, kept in their original order and joined by single
    spaces. It is the empty string when no word of *s* is in *words*.
    """
    shared = (token for token in s.split() if token in words)
    return ' '.join(shared)
def calc_groupings(lst):
    """Group the strings of *lst* by the common words they contain.

    The key of each string is computed by ``grouping_key`` from the words
    returned by ``find_common_words``. Strings with equal keys end up in the
    same group wherever they sit in *lst*; itertools.groupby would only
    merge neighbouring items and silently drop earlier groups that share a
    key.

    Returns:
        dict mapping key -> list of strings, both in first-seen order.
    """
    common_words = find_common_words(lst)
    groups = {}
    for s in lst:
        groups.setdefault(grouping_key(s, common_words), []).append(s)
    return groups
# Demo input: two families of names sharing a leading word or trailing phrase.
t = [
    'Doghouse Amsterdam',
    'Doghouse Antwerp',
    'Doghouse Vienna',
    'House by KatSkill',
    'Garden by KatSkill',
    'Meadow by KatSkill',
]

# Pretty-print the computed groups with a 4-space indent.
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(calc_groupings(t))
输出
{ 'Doghouse': ['Doghouse Amsterdam', 'Doghouse Antwerp', 'Doghouse Vienna'],
'by KatSkill': [ 'House by KatSkill',
'Garden by KatSkill',
'Meadow by KatSkill']}
TA贡献1784条经验 获得超2个赞
# Strings to be grouped.
mylist = [
    'Doghouse Amsterdam',
    'Doghouse Antwerp',
    'Doghouse Vienna',
    'House by KatSkill',
    'Garden by KatSkill',
    'Meadow by KatSkill',
]
# Known key phrases to group by.
test = [
    'Doghouse',
    'by KatSkill',
]
使用字典推导式和列表推导式:
# Map each key phrase to every entry of mylist that contains it as a substring.
res = {
    phrase: [entry for entry in mylist if phrase in entry]
    for phrase in test
}
或者先创建一个空字典,再在循环中使用列表推导式填充:
# Same mapping built step by step: one list comprehension per key phrase.
resultdict = {}
for i in test:
    resultdict[i] = [entry for entry in mylist if i in entry]
添加回答
举报