import requests from bs4 import BeautifulSoup import bs4 def getHTMLText(url): try: r = requests.get(url,timeout=30) r.raise_for_status() r.encoding = r.apparent_encodi...
需要爬取的网页:http://www.zuihaodaxue.cn/zuihaodaxuepaiming2016.html 右键单击 -> 查看网页源代码: 找到需要爬取的信息的位置:(可以Ctrl+F 搜索 "清华大学" 快速找到位置) 仔细观察网页源代码可以...
import requests from bs4 import BeautifulSoup import bs4 def gethtml(url): '''获取html页面''' try: r=requests.get(url,timeout=30) r.raise_for_status() r.encoding=r...
import requests from bs4 import BeautifulSoup import re allUniv = [] def getHTMLText(url): try: r = requests.get(url,timeout = 30) r.raise_for_status() r...