你可以循环遍历每个 BeautifulSoup 对象的 contents 属性。
要显示节点标签,只需在调用 nx.draw 时传入 with_labels=True 参数:
import networkx as nx
import matplotlib.pyplot as plt
from collections import defaultdict
from bs4 import BeautifulSoup as soup
# Sample document used to demonstrate the graph construction. The markup is
# split across lines purely for readability; the pieces concatenate to one
# string with no extra whitespace.
ex0 = (
    "<html><head><title>Are you lost ?</title></head>"
    "<body><h1>Lost on the Intenet ?</h1>"
    "<h1>Don't panic, we will help you</h1>"
    "<strong><pre> * <----- you are here</pre></strong>"
    "</body></html>"
)

# Parse with the parser bundled with the standard library.
d = soup(ex0, features='html.parser')
def _traverse_html(_d: soup, _graph: nx.Graph, _counter, _parent=None) -> None:
    """Recursively add the tag hierarchy below ``_d`` to ``_graph``.

    Every child of ``_d`` that has a tag name becomes a node. The first
    occurrence of a tag name is labelled with the bare name (``h1``); later
    occurrences get a numeric suffix (``h1_1``, ``h1_2``, ...) so repeated
    tags remain distinct nodes. Children whose ``name`` is ``None`` or
    missing (bs4 text nodes are expected to fall in this category) are
    skipped.

    Args:
        _d: Parsed element whose ``contents`` are walked.
        _graph: Graph receiving the nodes and edges; mutated in place.
        _counter: Mutable mapping of tag name -> number of times that name
            has been seen so far; mutated in place. Any dict-like object
            works, it does not have to be a ``defaultdict``.
        _parent: Node label of ``_d`` inside ``_graph``, or ``None`` when
            ``_d`` is the document root and its children should not be
            linked to a parent node.
    """
    for child in _d.contents:
        # getattr keeps the tolerance for children without a ``name``
        # attribute without swallowing unrelated AttributeErrors.
        name = getattr(child, "name", None)
        if name is None:
            continue

        seen = _counter.get(name, 0)
        label = name if not seen else f'{name}_{seen}'
        _counter[name] = seen + 1

        # Register the node explicitly so an element with neither a parent
        # edge nor tag children still appears in the graph.
        _graph.add_node(label)
        if _parent is not None:
            _graph.add_edge(_parent, label)

        # Recurse with the unique label (not the bare tag name); otherwise
        # children of a repeated tag would all attach to its first occurrence.
        if getattr(child, "contents", None):
            _traverse_html(child, _graph, _counter, label)
# Undirected graph that will hold one node per tag in the parsed document.
_full_graph = nx.Graph()

# Populate the graph; a fresh zero-defaulting counter tracks how many times
# each tag name has been seen during the walk.
_traverse_html(d, _graph=_full_graph, _counter=defaultdict(int))

# Render the graph with the node names drawn on top of the nodes, then open
# the matplotlib window.
nx.draw(_full_graph, with_labels=True)
plt.show()