如果事先知道需要从哪些标签中提取文本,BeautifulSoup 会很好用(这样我就可以直接调用
soup.findAll(specific_tag)
),但我的情况并非如此:文本可能分布在多种不同的标签中,我必须把它们全部提取出来。例如:
<p>Science</p><div> Biology </div><div>Generation of mature T cells from human hematopoietic stem and progenitor cells in artificial thymic organoids. <span style=\"text-decoration: underline;\">Nature Methods</span> 2017,</div>
在这个例子中,需要同时从
<p>
标签和
<div>
标签中提取文本。
import re
import copy
from html.parser import HTMLParser
from sample_htmls import *
class HTMLStripper(HTMLParser):
    """Extract the text of an HTML fragment as a list of text blocks.

    Text is accumulated while parsing and flushed into ``feeds`` when a
    tracked end tag matches the first tag of the current group, or when a
    new ``<li>`` starts while a group is already open. On every flush the
    sequence of tracked start/end tags seen since the previous flush is
    appended to ``tree``.

    Call ``close()`` after the last ``feed()`` so that text which is not
    followed by a tracked closing tag is flushed as well.
    """

    # Tags that delimit text blocks when no explicit list is supplied.
    DEFAULT_LOOKUP_TAGS = ('div', 'span', 'p', 'ul', 'li')

    def __init__(self, lookup_tags=None):
        """Initialise an empty parser.

        Args:
            lookup_tags: Optional iterable of tag names to track. When
                omitted, ``DEFAULT_LOOKUP_TAGS`` is used.
        """
        # HTMLParser.__init__ already calls reset(), so it is not repeated.
        super().__init__()
        # Not read by this class; kept so existing attribute access works.
        self.strict = False
        self.convert_charrefs = True
        self.feeds = []         # flushed text blocks (may contain '')
        self.sentence = ''      # text collected since the last flush
        self.current_path = []  # tracked tags seen since the last flush
        self.tree = []          # one tag path per flush
        if lookup_tags is None:
            lookup_tags = self.DEFAULT_LOOKUP_TAGS
        self.lookup_tags = list(lookup_tags)

    def update_feed(self):
        """Flush the pending tag path and text, then start a new group."""
        # The path holds only strings, so a shallow copy is a full snapshot.
        self.tree.append(list(self.current_path))
        self.current_path[:] = []
        # Collapse runs of spaces and trim the ends before storing.
        self.feeds.append(re.sub(' +', ' ', self.sentence).strip())
        self.sentence = ''

    def handle_starttag(self, tag, attrs):
        """Record a tracked start tag; a new ``<li>`` flushes an open group."""
        if tag not in self.lookup_tags:
            return
        if tag == 'li' and self.current_path:
            self.update_feed()
        self.current_path.append(tag)

    def handle_endtag(self, tag):
        """Record a tracked end tag and flush when it closes the group."""
        if tag not in self.lookup_tags:
            return
        self.current_path.append(tag)
        # The group is complete once the tag that opened it is closed.
        if tag == self.current_path[0]:
            self.update_feed()

    def handle_data(self, data):
        """Append a text chunk to the pending sentence, space-separated."""
        self.sentence += ' ' + data

    def close(self):
        """Finish parsing and flush text that no closing tag flushed.

        Without this, text after the last tracked closing tag (or inside
        an unclosed tag) would stay in ``sentence`` and never be returned
        by ``get_data()``.
        """
        super().close()
        if self.sentence.strip() or self.current_path:
            self.update_feed()

    def get_tree(self):
        """Return the list of tag paths, one entry per flush."""
        return self.tree

    def get_data(self):
        """Return the flushed text blocks, skipping empty ones."""
        return [x for x in self.feeds if x]
对上面的示例运行这段代码:
# Parse the sample markup, then print the recorded tag paths, the raw input
# and the extracted text blocks.
# NOTE(review): `mystr` is presumably provided by `sample_htmls` — confirm.
parser = HTMLStripper()
parser.feed(mystr)
# close() forces the parser to process any input it is still buffering, so
# nothing is missing when the results are read below.
parser.close()
l1 = parser.get_tree()
feed = parser.get_data()
print(l1)
print("\n", mystr)
print("\n", feed)
print("\n\n")
输出如下:
[['ul'], ['li', 'li'], ['li', 'li'], ['li', 'li'], ['li', 'li'], ['ul']]
<ul><li>Registered Nurse in <font>Missouri</font>, License number <font>xxxxxxxx</font>, <font>2017</font></li><li>AHA Advanced Cardiac Life Support (ACLS) Certification <font>2016-2018</font></li><li>AHA PALS - Pediatric Advanced Life Support 2017-2019</li><li>AHA Basic Life Support 2016-2018</li></ul>
['Registered Nurse in Missouri , License number xxxxxxxx , 2017', 'AHA Advanced Cardiac Life Support (ACLS) Certification 2016-2018', 'AHA PALS - Pediatric Advanced Life Support 2017-2019', 'AHA Basic Life Support 2016-2018']
它同样适用于混合了多种标签的 HTML 字符串:
[['p', 'p'], ['div', 'div'], ['div', 'span', 'span', 'div']]
<p>Science</p><div> Biology </div><div>Generation of mature T cells from human hematopoietic stem and progenitor cells in artificial thymic organoids. <span style="text-decoration: underline;">Nature Methods</span> 2017,</div>
['Science', 'Biology', 'Generation of mature T cells from human hematopoietic stem and progenitor cells in artificial thymic organoids. Nature Methods 2017,']