import re
from collections import defaultdict, Counter
def create_dict(text):
    """Split tagged text into one lower-cased string per paragraph.

    A paragraph opens at a line starting with ``<P ID=n>`` and closes at a
    line starting with ``</P>``; the tag lines themselves are not stored.
    Lines that are not inside any paragraph (before the first opening tag or
    after a closing tag) are ignored.

    Args:
        text: Input made of ``<P ID=n>`` ... ``</P>`` sections.

    Returns:
        A ``defaultdict`` whose missing keys yield ``""``. Each key is the
        matched opening tag text (for example ``"<P ID=1>"``) and each value
        is that paragraph's lines, lower-cased and joined with single spaces.
    """
    pieces = defaultdict(list)
    current_key = None  # None means "not inside a paragraph"

    for line in text.splitlines():
        opening = re.match(r'<P ID=(\d+)>', line)
        if opening:
            # group(0) is the whole tag; it stays the key so that the keys
            # produced for existing inputs do not change.
            current_key = opening.group(0)
            continue  # skip line containing paragraph ID
        if re.match(r'</P>', line):
            current_key = None
            continue  # skip line containing paragraph ending token
        if current_key is None:
            # Text outside <P>...</P> has no paragraph to belong to.
            continue
        pieces[current_key].append(line.lower())

    # Join with a space so the last word of one line and the first word of
    # the next are not fused into a single token.
    d = defaultdict(str)
    for key, chunks in pieces.items():
        d[key] = " ".join(chunks)
    return d
def document_frequency(d):
    """Count how many times each word occurs across all paragraphs.

    Args:
        d: Mapping whose values are paragraph strings.

    Returns:
        A ``Counter`` mapping each ``\\w+`` token to its total number of
        occurrences over every value in ``d``.
    """
    tokens = (
        word
        for paragraph_text in d.values()
        for word in re.findall(r'\w+', paragraph_text)
    )
    return Counter(tokens)
def paragraph_frequency(d):
    """Count in how many paragraphs each word appears.

    Args:
        d: Mapping whose values are paragraph strings.

    Returns:
        A ``Counter`` mapping each ``\\w+`` token to the number of values in
        ``d`` that contain it at least once.
    """
    # Reducing each paragraph to a set means a word repeated inside one
    # paragraph contributes only once for that paragraph.
    distinct_per_paragraph = (
        set(re.findall(r'\w+', paragraph_text)) for paragraph_text in d.values()
    )
    tally = Counter()
    for distinct_words in distinct_per_paragraph:
        tally.update(distinct_words)
    return tally
# Demo: run the helpers above on a small three-paragraph sample and print
# both frequency tables. The misspellings in the sample are part of the data
# and show up as separate tokens in the counts.
text = """<P ID=1>
I have always wanted to try like, multiple? Different rasteraunts. Not quite sure which kind, maybe burgers!
</P>
<P ID=2>
Nice! I love burgers. Cheeseburgers, too. Have you ever gone to a diner type restauraunt? I have always wanted to try every diner in the country.
</P>
<P ID=3>
I am not related to the rest of these paragraphs at all.
</P>"""
d = create_dict(text)
doc_freq = document_frequency(d)  # total occurrences of each word over all paragraphs
para_freq = paragraph_frequency(d)  # number of paragraphs in which each word appears
print("document:", doc_freq)
print("paragraph: ", para_freq)
结果
document: Counter({'i': 4, 'to': 4, 'have': 3, 'always': 2, 'wanted': 2, 'try': 2, 'not': 2, 'burgers': 2, 'diner': 2, 'the': 2, 'like': 1, 'multiple': 1, 'different': 1, 'rasteraunts': 1, 'quite': 1, 'sure': 1, 'which': 1, 'kind': 1, 'maybe': 1, 'nice': 1, 'love': 1, 'cheeseburgers': 1, 'too': 1, 'you': 1, 'ever': 1, 'gone': 1, 'a': 1, 'type': 1, 'restauraunt': 1, 'every': 1, 'in': 1, 'country': 1, 'am': 1, 'related': 1, 'rest': 1, 'of': 1, 'these': 1, 'paragraphs': 1, 'at': 1, 'all': 1})
paragraph: Counter({'to': 3, 'i': 3, 'try': 2, 'have': 2, 'burgers': 2, 'wanted': 2, 'always': 2, 'not': 2, 'the': 2, 'which': 1, 'multiple': 1, 'quite': 1, 'rasteraunts': 1, 'kind': 1, 'like': 1, 'maybe': 1, 'sure': 1, 'different': 1, 'love': 1, 'too': 1, 'in': 1, 'restauraunt': 1, 'every': 1, 'nice': 1, 'cheeseburgers': 1, 'diner': 1, 'ever': 1, 'a': 1, 'type': 1, 'you': 1, 'country': 1, 'gone': 1, 'at': 1, 'related': 1, 'paragraphs': 1, 'rest': 1, 'of': 1, 'am': 1, 'these': 1, 'all': 1})