所以问题是:
给定表单中的数据集
1,2
1,3
1,4
3,1
6,5
5,7
识别链条
1 -> 2 -> 3 -> 4
和
5 -> 6 -> 7
并将其输出为
1,2
1,3
1,4
5,6
5,7
这里有一个python的工作解决方案。(节日快乐)
要运行它,请使用
python thisfile.py yourdata.csv > output.csv
当然,您需要安装python3。
代码中有大量注释。我根本没有考虑效率,所以把水壶放上——大约需要15分钟左右才能完成。
如果你想让它更快,这就是清单。需要花费时间的append()调用。使用numpy可能会加快速度,但我不想添加额外的依赖项。
如果你有任何问题,请告诉我。
示例数据的输出为:
3856 , 18216
3856 , 59888
12513 , 12812
12513 , 19922
12513 , 52188
import csv
import sys
def flatten(l):
return list(set([item for subl in l for item in subl]))
def main():
# read the csv
raw_data = []
with open(sys.argv[1], 'r') as so_data:
lst = csv.reader(so_data)
for row in lst:
raw_data.append(row)
data = []
for i in raw_data:
data.append([int(i[0].strip()), int(i[1].strip())])
#set keys to uniq elements from col1 and col2
keys1 = list(set([i[0] for i in data]))
keys2 = list(set([i[1] for i in data]))
keys = list(set(keys1 + keys2))
# find the subchains for each key
subchains = {}
for key in keys:
found = [k for k in data if key in k]
found = flatten(found)
subchains[key] = found
# This is the tricky bit
# we need to follow each element of the subchains looking
# for new links - we are done for a key when the list doesn't grow
chain, links = [], []
chain_dict = {}
for key in keys:
links.append(subchains[key])
links = flatten(links)
done = False
size = len(links)
while not done:
for i in links:
# find subchain for i
for e in subchains[i]:
chain.append(e)
chain = list(set(chain))
chain.sort()
if len(chain) > size:
done = False
size = len(chain)
else:
done = True
chain_dict[key] = chain
chain, links = [], []
# shorter chains will now be subsets of longer ones
# and those can be discarded
remove_list = []
for i in keys:
for j in keys:
if set(chain_dict[j]) < set(chain_dict[i]):
remove_list.append(j)
remove_list = list(set(remove_list))
for i in remove_list:
del chain_dict[i]
# remove duplicate values from dict
# it doesn't matter which key we remove
# since we only output from value
result = {}
for key, value in chain_dict.items():
if value not in result.values():
result[key] = value
# now output as per OP's request
for k, v in result.items():
v.sort()
for i in v[1:]:
print(v[0], ",", i)
if __name__ == '__main__':
main()