以下代码使用 Selenium 提取所需的蛋白质序列。我修改了您的原始代码,以获得您想要的结果。
from bs4 import BeautifulSoup
from selenium import webdriver
import requests
# Module-level Selenium Firefox session; getSequence() uses it to load the
# protein record page and read the resulting page source.
driver = webdriver.Firefox()
def getSequence():
    """Prompt for a protein name and return its sequence scraped from NCBI.

    The user is asked for a search term on stdin. The NCBI protein search
    page is fetched with ``requests`` and the text between the first
    ``<dd>`` and ``</dd>`` in the re-serialized markup is taken as the
    accession. The accession's record page is then loaded through the
    module-level Selenium ``driver`` and the text of every element with
    class ``ff_line`` is concatenated, with spaces removed.

    Returns:
        The concatenated sequence string (empty if the record page holds no
        ``ff_line`` elements), or ``None`` when the input is empty, the
        search request fails, no accession can be located, or the browser
        cannot load the record page.
    """
    # Imported locally so this block brings its own exception type into
    # scope; selenium is already a dependency of this file.
    from selenium.common.exceptions import WebDriverException

    searchProt = input("Enter a Protein Name!:")
    if searchProt == '':
        # Empty input is silently ignored.
        return None

    try:
        # Let requests build the query string so spaces and special
        # characters in the search term are URL-encoded correctly.
        page = requests.get(
            "https://www.ncbi.nlm.nih.gov/protein/",
            params={"term": searchProt},
            timeout=30,
        )
        page.raise_for_status()
    except requests.RequestException:
        print("Please Enter a Valid Protein")
        return None

    # Re-serialize through BeautifulSoup so the tags are normalized before
    # the plain-text search below.
    markup = str(BeautifulSoup(page.text, 'html.parser'))

    open_tag = '<dd>'
    close_tag = '</dd>'
    # str.find returns -1 when the tag is missing; check it explicitly
    # instead of slicing with a bogus offset.
    start = markup.find(open_tag)
    end = markup.find(close_tag, start + len(open_tag)) if start != -1 else -1
    if start == -1 or end == -1:
        print("Please Enter a Valid Protein")
        return None

    accession = markup[start + len(open_tag):end].strip()
    if not accession:
        print("Please Enter a Valid Protein")
        return None

    newSearchString = "https://www.ncbi.nlm.nih.gov/protein/" + accession
    try:
        driver.get(newSearchString)
        html = driver.page_source
    except WebDriverException:
        # Only browser/navigation failures are reported here; programming
        # errors and KeyboardInterrupt are no longer swallowed.
        print("Please Enter a Valid Protein")
        return None

    newSoup = BeautifulSoup(html, "lxml")
    # Each "ff_line" element holds one chunk of the sequence; drop the
    # spaces inside each chunk and concatenate them in document order.
    return "".join(
        tag.text.strip().replace(" ", "")
        for tag in newSoup.find_all(class_="ff_line")
    )
# Run one interactive lookup and print the result. The browser session is
# closed in `finally` so the Firefox process does not outlive the script,
# even if the lookup raises.
try:
    sequence = getSequence()
    print(sequence)
finally:
    driver.quit()
输入“p53”时,会生成以下输出:
meepqsdlsielplsqetfsdlwkllppnnvlstlpssdsieelflsenvtgwledsggalqgvaaaaastaedpvtetpapvasapatpwplsssvpsyktfqgdygfrlgflhsgtaksvtctyspslnklfcqlaktcpvqlwvnstpppgtrvramaiykklqymtevvrrcphherssegdslappqhlirvegnlhaeylddkqtfrhsvvvpyeppevgsdcttihynymcnsscmggmnrrpiltiitledpsgnllgrnsfevricacpgrdrrteeknfqkkgepcpelppksakralptntssspppkkktldgeyftlkirgherfkmfqelnealelkdaqaskgsedngahssylkskkgqsasrlkklmikregpdsd