这也许不是最佳的解决方案,但它在这个小型 DataFrame 上能够完成任务:
编辑:增加了对公司名称的检查(假设公司名称中的标点符号已事先去除)。
def drop_redundant_addresses(frame):
    """Return *frame* without rows whose address is covered by another row.

    An address is split on ", " into a set of components. A row is removed
    when another row of the same company has an address whose component set
    strictly contains this row's set, or has exactly the same set and appears
    earlier in the frame (so one copy of a duplicate is always kept).

    Args:
        frame: DataFrame with string columns "company" and "address".

    Returns:
        A new DataFrame with the redundant rows removed; the original index
        labels of the surviving rows are preserved.
    """
    # Sets make "is this address contained in that one" a plain subset test.
    token_sets = [set(text.split(', ')) for text in frame['address']]
    companies = frame['company'].tolist()

    keep_positions = []
    for i, (tokens, company) in enumerate(zip(token_sets, companies)):
        redundant = False
        for j, (other_tokens, other_company) in enumerate(
                zip(token_sets, companies)):
            if i == j or company != other_company:
                continue
            # `<` is a strict subset: identical addresses must not eliminate
            # each other, so for exact duplicates only the later row is dropped.
            if tokens < other_tokens or (tokens == other_tokens and j < i):
                redundant = True
                break
        if not redundant:
            keep_positions.append(i)

    # Select by position, not by label, so non-default or duplicated index
    # labels cannot cause the wrong rows to be removed.
    return frame.iloc[keep_positions]


df = pd.DataFrame({
    "company": ['A', 'A', 'A', 'B', 'B'],
    "address": [
        '16D Bayberry Rd, New Bedford, MA, 02740, USA',
        'MA, USA',
        'USA',
        'New Bedford, MA, USA',
        'MA, USA',
    ],
})
df = drop_redundant_addresses(df)
print(df)
company address
0 A 16D Bayberry Rd, New Bedford, MA, 02740, USA
3 B New Bedford, MA, USA