
Extracting URLs from a Wikipedia table

  Adam_G  ·  asked 7 years ago

    I'm using the code below, taken from here, to scrape a Wikipedia table. In addition to the table information, I'd like to visit each country's hyperlink and copy the text from each page. Is there a way to do this with BeautifulSoup ?

    # -*- coding: utf-8 -*-
    """
    Scrape a table from wikipedia using python. Allows for cells spanning multiple rows and/or columns. Outputs csv files for
    each table
    """
    
    from bs4 import BeautifulSoup
    import urllib.request
    import os
    import codecs
    
    # wiki = "https://en.wikipedia.org/wiki/International_Phonetic_Alphabet_chart_for_English_dialects"
    wiki = 'https://en.wikipedia.org/wiki/List_of_national_capitals_in_alphabetical_order'
    header = {'User-Agent': 'Mozilla/5.0'}  # Needed to prevent 403 error on Wikipedia
    req = urllib.request.Request(wiki, headers=header)
    page = urllib.request.urlopen(req)
    soup = BeautifulSoup(page, "html.parser")
    
    tables = soup.findAll("table", {"class": "wikitable"})
    
    # show tables
    # for table in tables:
    #     print("###############")
    #     print(table)#.text)#[:100])
    
    for tn in range(len(tables)):
        table = tables[tn]
    
        # preinit list of lists
        rows = table.findAll("tr")
        row_lengths = [len(r.findAll(['th', 'td'])) for r in rows]
        ncols = max(row_lengths)
        nrows = len(rows)
        data = []
        for i in range(nrows):
            rowD = []
            for j in range(ncols):
                rowD.append('')
            data.append(rowD)
    
        # process html
        for i in range(len(rows)):
            row = rows[i]
            cells = row.findAll(["td", "th"])
            for j in range(len(cells)):
                cell = cells[j]

                # lots of cells span cols and rows, so deal with that
                cspan = int(cell.get('colspan', 1))
                rspan = int(cell.get('rowspan', 1))
                for k in range(rspan):
                    for l in range(cspan):
                        data[i + k][j + l] += cell.text

        # write data out
        page = os.path.split(wiki)[1]
        fname = 'output_{}_t{}.csv'.format(page, tn)
        f = codecs.open(fname, 'w', encoding='utf-8')
        for i in range(nrows):
            rowStr = ','.join(data[i])
            rowStr = rowStr.replace('\n', '')
            f.write(rowStr + '\n')

        f.close()
    
    1 Answer  |  7 years ago
    alexisdevarennes  ·  answered 7 years ago
    from bs4 import BeautifulSoup
    import requests

    base_url = 'https://en.wikipedia.org'
    wiki_url = base_url + '/wiki/List_of_national_capitals_in_alphabetical_order'
    print('Fetching main wiki article: %s' % wiki_url)
    page = requests.get(wiki_url).text
    print('Done. Extracting table links..')
    html = BeautifulSoup(page, 'html.parser')
    table = html.find('table', 'wikitable')
    links = table.findAll('a')
    links_content = {}
    print('Done extracting links. About to fetch: %s links..' % len(links))
    for link in links:
        href = link.get('href')
        if not href or not href.startswith('/wiki/'):
            continue  # skip footnote anchors and anything that is not an article link
        url = base_url + href  # hrefs inside the table are relative, so prepend the domain
        print('Fetching: %s' % url)
        links_content[url] = requests.get(url).text
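
    The loop above collects the raw HTML of each linked page. Since the question also asks for the text of each page, here is a minimal sketch (not part of the original answer) of how the fetched pages could be reduced to plain text; it assumes the links_content dict built above, and that the article body sits in Wikipedia's mw-content-text container:

    for url, html_text in links_content.items():
        article = BeautifulSoup(html_text, 'html.parser')
        # the main article body on Wikipedia pages lives in this div
        body = article.find('div', {'id': 'mw-content-text'})
        if body is not None:
            text = body.get_text(separator='\n', strip=True)
            print('%s: %d characters of article text' % (url, len(text)))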