輸入串列:
['ABCD','AACD','ABEF','ABEGG']
字串是用點連接的變數。因此,在第一個字串中,變數是 ABCD,但在最后一個字串中變數是 ABE GG(變數的長度可以不同),但字串將始終具有相同數量的變數,以點分隔。我想將那些只有一個不同變數的字串組合在一起。所以上面會產生2組。
['ABCD','AACD'] ['ABEF','ABEGG']
差異必須在同一變數中。所有變數之間沒有一個差異。并且每個字串應該只包含一次,而不是跨組多次。
真實資料可能有多達 20 個變數(但每個串列中的所有專案將始終具有相同數量的變數),并且串列可能有多個 minion 字串,因此性能也是一個問題。
我寫了一個演算法來實作,但是太復雜了。我也通過 itertools groupby 嘗試過,但無法讓它產生正確的結果:
import itertools
import difflib
class Grouper:
def __init__(self, diff):
self.last = None
self.diff = diff
def get_diffs(self, curr_key):
if self.last == None:
return []
dims = curr_key.split('.')
previous_dims = self.last.split('.')
diffs = list((dm for dm in difflib.ndiff(dims, previous_dims) if '-' not in dm and '?' not in dm))
return [n for n, d in enumerate(diffs) if ' ' in d]
def __call__(self, item):
result = self.get_diffs(item)
print(item)
self.last = item
return result
grp = Grouper(data)
groups = []
uniquekeys = []
for k, g in itertools.groupby(data, grp):
groups.append(list(g)) # Store group iterator as a list
uniquekeys.append(k)
uj5u.com熱心網友回復:
我添加了另一個字串“ABCF”,這個字串可以與“ABCD”和“ABEF”匹配:
import itertools
def main(l: list) -> list:
splits = [tuple(s.split(".")) for s in l]
groups = {}
# Dict of {(tuple, index of difference): list of tuples that match the key}
for split in splits:
matched = False
for (t, i) in groups.keys():
# We match two splits if they only have one difference
diffs = [i for i in range(len(split)) if split[i] != t[i]]
if len(diffs) == 1:
# The strings match but is the first match of 't'?
if i is None:
groups.pop((t, i))
groups[(t, diffs[0])] = [split]
# We found a match, stop the matching of this tuple
matched = True
break
elif diffs[0] == i:
groups[(t, i)].append(split)
matched = True
break
if not matched:
# Let's add this split as a candidate for future splits
groups[(split, None)] = []
return [[".".join(k)] [".".join(s) for s in v] for (k, i), v in groups.items()]
print(main(["A.B.C.D", "A.A.C.D", "A.B.E.F", "A.B.E.GG", "A.B.C.F"]))
# [['A.B.C.D', 'A.A.C.D'], ['A.B.E.F', 'A.B.E.GG'], ['A.B.C.F']]
print(main(["A.B.C", "A.B.D", "A.E.D"]))
# [['A.B.C', 'A.B.D'], ['A.E.D']]
uj5u.com熱心網友回復:
使用貪心演算法解決集合覆寫問題:
string_list = ['A.B.C.D','A.A.C.D','A.B.E.F','A.B.E.GG', 'A.B.C.F']
list_list = [s.split('.') for s in string_list]
# [['A', 'B', 'C', 'D'], ['A', 'A', 'C', 'D'], ...]
universe = set(range(len(string_list)))
# {0, 1, 2, 3, 4}
# represents string_list by index
# for instance, 3 represents 'A.B.E.GG'
subsets = {frozenset(j for j,l in enumerate(list_list) if l[:i]==l0[:i] and l[i 1:]==l0[i 1:]) for l0 in list_list for i in range(len(l0))}
# {{2}, {2, 3}, {0, 1}, {0, 4}, {3}, {2, 4}, {1}, {4}, {0}}
# represents possible groupings by index
# for instance, {2, 3} represents {'A.B.E.F', 'A.B.E.GG'}
solution = []
while universe:
max_subset = max(subsets, key=len)
solution.append(max_subset)
universe -= max_subset
subsets.remove(max_subset)
subsets = {subset & universe for subset in subsets}
print(solution)
# {{2, 3}, {0, 1}, {4}}
clusters = [[string_list[i] for i in subset] for subset in solution]
print(clusters)
# [['A.B.E.F', 'A.B.E.GG'], ['A.B.C.D', 'A.A.C.D'], ['A.B.C.F']]
uj5u.com熱心網友回復:
您還可以使用遞回:
def get_groups(d):
def m(a, b):
return max(map(len, [(s1:=set(a.split('.'))) - (s2:=set(b.split('.'))), s2 - s1])) == 1
def groups(n, s = []):
if not (v:=[i for i in d if all(m(j, i) for j in n) and i not in s]):
yield [*set(n)]
if (k:=[i for i in d if i not in s [n]]):
yield from groups([k[0]], s=s n [k[0]])
else:
yield from groups(n v, s = s v)
return list(groups([d[0]], [d[0]]))
print(get_groups(['A.B.C.D','A.A.C.D','A.B.E.F','A.B.E.GG']))
print(get_groups(["A.B.C", "A.B.D", "A.E.D"]))
輸出:
[['A.B.C.D', 'A.A.C.D'], ['A.B.E.F', 'A.B.E.GG']]
[['A.B.C', 'A.B.D'], ['A.E.D']]
編輯:預排序輸入的解決方案:
def m(a, b):
return max(map(len, [(s1:=set(a.split('.'))) - (s2:=set(b.split('.'))), s2 - s1])) == 1
def groups(d):
v = []
for i in d:
if not v or all(m(j, i) for j in v):
v.append(i)
else:
yield v
v = [i]
yield from ([] if not v else [v])
print(list(groups(sorted(['A.B.C.D','A.A.C.D','A.B.E.F','A.B.E.GG']))))
print(list(groups(sorted(["A.B.C", "A.B.D", "A.E.D"]))))
輸出:
[['A.A.C.D', 'A.B.C.D'], ['A.B.E.F', 'A.B.E.GG']]
[['A.B.C', 'A.B.D'], ['A.E.D']]
時間:
import random, string, time
import contextlib
r_s = sorted(['.'.join(random.choice(string.ascii_uppercase) for _ in range(4)) for _ in range(1000000)])
@contextlib.contextmanager
def timeit(f):
t = time.time()
yield
print(f'{f} time: {time.time() - t}')
with timeit('get_groups'):
_ = [*get_groups(r_s)]
with timeit('groups'):
_ = [*groups(r_s)]
輸出:
... #'get_groups' hangs
groups time: 1.30812406539917
轉載請註明出處,本文鏈接:https://www.uj5u.com/shujuku/355673.html
下一篇:如何通過特定條件創建新列?
