一种解决方案是创建 `CorpusReader` 的子类,并在该子类的构造函数中读取 CSV,构建一个从每个文件的 `fileid` 到 CSV 中对应行的 dict,从而将每个文件与其元数据相关联。这样,任何可以访问语料库的代码都可以访问元数据。
例如:
import nltk
import csv
from nltk.corpus.reader.api import CorpusReader
class MetadataCSVCorpusReader(CorpusReader):
    """Corpus reader that also exposes the rows of the corpus ``metadata.csv``.

    ``metadata.csv`` is opened through ``self.open`` and parsed with
    ``csv.DictReader``; every row is stored under the value of its
    ``fileid`` column.
    """

    def __init__(self, root, fileids, encoding='utf8', tagset=None):
        """Initialise the reader and index ``metadata.csv`` by ``fileid``.

        All four arguments are forwarded unchanged to
        ``CorpusReader.__init__``.

        Raises:
            KeyError: if ``metadata.csv`` has rows without a ``fileid`` column.
        """
        # Forward the caller's values: passing the literal defaults here
        # would silently discard a custom encoding or tagset.
        super().__init__(root, fileids, encoding=encoding, tagset=tagset)
        # The context manager closes the stream once every row is parsed.
        with self.open('metadata.csv') as stream:
            # If a fileid occurs more than once, the last row wins.
            self._parsed_metadata = {
                row['fileid']: row for row in csv.DictReader(stream)
            }

    @property
    def metadata(self):
        """Return the raw contents of the corpus ``metadata.csv`` file.

        The file is re-read on every access; any error raised by
        ``self.open`` (for example when the file is missing) propagates.
        """
        with self.open("metadata.csv") as stream:
            return stream.read()

    @property
    def parsed_metadata(self):
        """Return the ``metadata.csv`` rows as a dict keyed by ``fileid``.

        This is the dict built in ``__init__`` itself, not a copy.
        """
        return self._parsed_metadata