您需要在分词器（tokenizer）中注册一个自定义的后缀匹配规则。可以按如下方式实现：
import re
import spacy
from spacy.tokenizer import Tokenizer
# Matches a single period at the very end of a string. Periods that are
# followed by more characters (e.g. the one inside "48.86N") do not match.
suffix_re = re.compile(r'''\.$''')


def custom_tokenizer(nlp):
    """Build a Tokenizer whose only custom rule is the trailing-period suffix.

    Args:
        nlp: A loaded spaCy pipeline; only its ``vocab`` is used here.

    Returns:
        A ``Tokenizer`` constructed from ``nlp.vocab`` with
        ``suffix_search`` set to ``suffix_re.search``.

    Note:
        No prefix, infix or exception rules are passed to the constructor,
        so other punctuation such as a trailing comma is not handled by
        this tokenizer.
    """
    return Tokenizer(nlp.vocab, suffix_search=suffix_re.search)
# Load the pipeline and swap its tokenizer for the custom one.
nlp = spacy.load("en_core_web_sm")
nlp.tokenizer = custom_tokenizer(nlp)

# Sample sentences: a plain sentence, one ending in a coordinate, and one
# that also contains a comma.
sample_sentences = (
    "The Eiffel Tower is very beautiful.",
    "The Eiffel Tower is located at 48.86N 2.29E.",
    "The Eiffel Tower, Norte Dame and Champs Elysee are located at 48.86N 2.29E.",
)

# Parse each sentence, keeping the original module-level names.
doc, doc2, doc3 = (nlp(sentence) for sentence in sample_sentences)

# Print the token texts of each parsed document, one list per line.
for parsed in (doc, doc2, doc3):
    print([token.text for token in parsed])
输出（注意：该自定义分词器只定义了匹配末尾句点的后缀规则，因此第三个例子中 'Tower,' 的逗号不会被拆分）：
['The', 'Eiffel', 'Tower', 'is', 'very', 'beautiful', '.']
['The', 'Eiffel', 'Tower', 'is', 'located', 'at', '48.86N', '2.29E', '.']
['The', 'Eiffel', 'Tower,', 'Norte', 'Dame', 'and', 'Champs', 'Elysee', 'are', 'located', 'at', '48.86N', '2.29E', '.']