1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
|
import re
from functools import reduce
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup
def checkEn(strs):
    """Report whether ``strs`` contains a character in U+4E00..U+9FD5.

    Note the polarity: despite the name, ``True`` means at least one
    character in that (CJK ideograph) range was found, and ``False`` means
    none was found -- including for an empty string.
    """
    return any('\u4e00' <= ch <= '\u9fd5' for ch in strs)
def getInfoFromPubmed(strs):
    """Search PubMed for ``strs`` and scrape citation details from the page.

    Args:
        strs: Search term (the caller passes an article title).

    Returns:
        A 7-tuple ``(author, year, vol, no, page, doi, journal)`` of strings:
        ``author`` is the comma-joined, de-duplicated list of author names;
        ``year`` is the citation text before the first ``;``; ``vol`` is the
        text between the first ``;`` and the following ``:`` with any
        parenthesised part removed; ``no`` is that parenthesised part, or
        ``None`` when there is none; ``page`` is the text after the last
        ``:`` with surrounding dots stripped.

    Raises:
        requests.RequestException: The request failed, timed out, or
            returned an HTTP error status.
        AttributeError: An expected element is missing from the page.
        IndexError: The citation text does not have the expected
            ``...;...`` layout.
    """
    # Let requests build the query string so that titles containing spaces,
    # '&', '#', '+', etc. are percent-encoded instead of corrupting the URL.
    with requests.Session() as session:
        response = session.get(
            "https://pubmed.ncbi.nlm.nih.gov/",
            params={"term": strs},
            timeout=30,
        )
        # Fail with a clear HTTP error rather than an opaque AttributeError
        # when parsing an error page below.
        response.raise_for_status()
        html = response.text

    soup = BeautifulSoup(html, 'lxml')
    cite = soup.find("span", class_="cit").get_text().strip()

    names = [
        item.get_text().strip()
        for item in soup.find_all("a", class_="full-name")
    ]
    # dict.fromkeys drops repeated names while keeping first-seen order.
    author = ','.join(dict.fromkeys(names))

    doi = soup.find("span", class_="citation-doi").get_text().strip()
    journal = soup.find(
        "button", class_="journal-actions-trigger trigger"
    ).get_text().strip()

    cite_parts = cite.split(";")
    year = cite_parts[0]
    vol = cite_parts[1].split(":")[0]
    if "(" in vol:
        # Raw string: '\(' in a normal string literal is an invalid escape.
        no = re.findall(r'\((.*?)\)', vol)[0]
        vol = vol.split("(")[0]
    else:
        no = None
    page = cite.split(":")[-1].strip(".")
    return author, year, vol, no, page, doi, journal
def getInfoFromJustscience(journal):
    """Look up ``journal`` on sci.justscience.cn and return its impact factor.

    The second HTML table on the result page is scanned for a row whose
    '期刊缩写' column equals ``journal`` exactly.

    Args:
        journal: Journal abbreviation used both as the search query and as
            the value to match in the result table.

    Returns:
        The '影响因子' cell of the first matching row, or ``None`` when no
        row matches.

    Raises:
        requests.RequestException: The request failed, timed out, or
            returned an HTTP error status.
        ValueError, IndexError: The page does not contain the expected
            tables.
        KeyError: The table lacks the expected columns.
    """
    # Let requests encode the query so journal names containing spaces or
    # '&' do not corrupt the URL.
    with requests.Session() as session:
        response = session.get(
            "http://sci.justscience.cn/",
            params={"q": journal},
            timeout=30,
        )
        response.raise_for_status()
        html = response.text

    # Wrap the markup in StringIO: passing a literal HTML string to
    # read_html is deprecated in recent pandas releases.
    form = pd.read_html(StringIO(html))[1]

    # Return on the first match. Previously a later non-matching row reset
    # the result to None, and an empty table left it unbound.
    for index in form.index:
        if form.loc[index, '期刊缩写'] == journal:
            return form.loc[index, '影响因子']
    return None
if __name__ == '__main__':
    # Script entry point: read the publication sheet, fill in citation
    # details for every title without CJK characters, and write the result
    # to a new workbook.
    data = pd.read_excel("论文成果.xls", index_col=0, header=1)
    for index in data.index:
        title = data.loc[index, "论文名称"]
        # Non-string cells (e.g. empty cells, which pandas reads as NaN)
        # cannot be searched and would make checkEn raise; skip them.
        if not isinstance(title, str):
            continue
        # checkEn returns True when the title contains CJK characters;
        # only the remaining titles are looked up on PubMed.
        if checkEn(title):
            continue
        try:
            author, year, vol, no, page, doi, journal = getInfoFromPubmed(title)
            # Fixed: this was misspelled 'getInfoFromjustscience', raising a
            # NameError that the old bare 'except: pass' silently swallowed,
            # so no row was ever filled in.
            impact = getInfoFromJustscience(journal)
        except Exception as exc:
            # Best effort: one failed lookup must not abort the whole run,
            # but report it instead of hiding it.
            print("Skipping {!r}: {!r}".format(title, exc))
            continue
        data.loc[index, '作者名称'] = author
        data.loc[index, '发表时间'] = year
        data.loc[index, '卷号'] = vol
        data.loc[index, '期号'] = no
        data.loc[index, '页面范围'] = page
        data.loc[index, 'DOI码'] = doi
        data.loc[index, '影响因子'] = impact
    data.to_excel("result.xlsx")
|