解决了这个问题
finding largest common substring
问题
解决方案包括从url的每个字符构建一个解析树。树中的每个节点存储正数、负数和总数。最后,修剪树以返回最常见的模式。
代码:
def find_patterns(incoming_urls):
urls = {}
# make the tree
for url in incoming_urls:
url, atype = line.strip().split("____") # assuming incoming_urls is a list with each entry of type url__class
if len(url) < 100: # Take only the initial 100 characters to avoid building a sparse tree
bound = len(url) + 1
else:
bound = 101
for x in range(1, bound):
if url[:x].lower() not in urls:
urls[url[:x].lower()] = {'positive': 0, 'negative': 0, 'total': 0}
urls[url[:x].lower()][atype] += 1
urls[url[:x].lower()]['total'] += 1
new_urls = {}
# prune the tree
for url in urls:
if urls[url]['total'] < 5: # For something to be called as common pattern, there should be at least 5 occurrences of it.
continue
urls[url]['negative_percentage'] = (float(urls[url]['negative']) * 100) / urls[url]['total']
if urls[url]['negative_percentage'] < 85.0: # Assuming I am interested in finding url patterns for negative class
continue
length = len(url)
found = False
# iterate to see if a len+1 url is present with same total count
for second in urls:
if len(second) <= length:
continue
if url == second[:length] and urls[url]['total'] == urls[second]['total']:
found = True
break
# discard urls with length less than 20
if not found and len(url) > 20:
new_urls[url] = urls[url]
print "URL Pattern; Positive; Negative; Total; Negative (%)"
for url in new_urls:
print "%s; %d; %d; %d; %.2f" % (
url, new_urls[url]['positive'], new_urls[url]['negative'], new_urls[url]['total'],
new_urls[url]['negative_percentage'])