安装 Beautiful Soup 及 HTML 解析器 lxml
pip install bs4
pip install lxml
解析新浪高考热讯并导入MongoDB
import requests
import time
from bs4 import BeautifulSoup
import pymongo
# Crawl the Sina education news roll and store every article in MongoDB.
#
# Each stored document has a single top-level key of the form 'sinaNNNNNN'
# (a zero-padded running number) whose value holds the article's 'title',
# 'url' and, when the article page has an '#artibody' element, 'content'.

# Connect to the local MongoDB server; documents go to gaokao.news.
myclient = pymongo.MongoClient('mongodb://localhost:27017/')
mydb = myclient["gaokao"]
mycol = mydb["news"]

# Running count of processed articles; used to build the document key.
n = 0

# Walk list pages 1..10 (the URL requests 30 entries per page).
for i in range(1, 11):
    url = 'http://edu.sina.com.cn/other/roll.d.html?cat=80459&page={}&page_size=30'.format(i)
    # A timeout keeps one stalled connection from hanging the whole crawl.
    strhtml = requests.get(url, timeout=10)
    strhtml.encoding = 'utf8'
    soup = BeautifulSoup(strhtml.text, 'lxml')
    data = soup.select('#Main > div.listBlk > ul > li > a')

    for item in data:
        c_url = item.get('href')
        if not c_url:
            # An anchor without a link target cannot be fetched; skip it
            # before consuming a sequence number.
            continue

        n += 1
        m_num = 'sina' + str(n).zfill(6)

        # Build a fresh document for every article. insert_one() adds an
        # '_id' field to the dict it is given, so reusing one dict across
        # inserts would require clearing it each time.
        web_xx = {m_num: {'title': item.get_text(), 'url': c_url}}

        content = requests.get(c_url, timeout=10)
        content.encoding = 'utf8'
        # Name the parser explicitly, matching the list-page parse above.
        soup_content = BeautifulSoup(content.text, 'lxml')
        # If several '#artibody' elements match, the last one wins; if none
        # match, the document is stored without a 'content' field.
        for item1 in soup_content.select('#artibody'):
            web_xx[m_num]['content'] = item1.get_text()

        # Pause 3 seconds between articles to throttle the request rate.
        time.sleep(3)
        mycol.insert_one(web_xx)
        print(m_num + '入库成功!')