爬取新闻标题及链接并存储至MySQL
网页链接: http://world.people.com.cn/
一. 首先要获取数据,将数据暂存于 list列表
二. 将数据存储至MySQL:
1.创建连接
2.创建游标
3.传入参数,执行命令
4.数据提交(提交至MySQL)
5.关闭游标、连接
源码
import random
import time
from urllib.parse import urljoin

import pymysql
import requests
from lxml import etree
# Page that is scraped for headlines.
url = 'http://world.people.com.cn/'
# Request headers. The key must be the real HTTP header name 'User-Agent';
# a key spelled 'user_agent' is sent as an unrelated header and the
# browser-like agent string is never applied.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2117.157 Safari/537.36'
}


def absolute_url(base, href):
    """Return *href* as an absolute URL, resolved against *base*.

    Absolute links (http or https) are returned unchanged. Relative links,
    including root-relative ones such as '/n1/a.html', are joined to *base*
    without producing a doubled slash.
    """
    return urljoin(base, href)


def fetch_links(page_url=url, headers=header):
    """Download *page_url* and return its headlines as [title, url] pairs.

    Args:
        page_url: Address of the page to scrape.
        headers: HTTP headers sent with the request.

    Returns:
        A list of two-element lists ``[title, absolute_url]``, one per
        matched anchor that has both a text node and an href.

    Raises:
        requests.RequestException: On network failure, timeout, or a
            non-success HTTP status.
        UnicodeDecodeError: If the response body is not valid GBK.
    """
    # A timeout keeps the script from hanging forever on a stalled server.
    response = requests.get(url=page_url, headers=headers, timeout=10)
    response.raise_for_status()
    # The body is decoded explicitly as GBK before parsing.
    document = etree.HTML(response.content.decode('gbk'))
    anchors = document.xpath('/html/body/div[15]/div[1]/div/div/p/strong/a')

    # The accumulator lives outside the loop so every headline is kept,
    # not just the last one.
    links = []
    for anchor in anchors:
        texts = anchor.xpath('.//text()')
        hrefs = anchor.xpath('./@href')
        # Skip anchors without text or href instead of raising IndexError.
        if not texts or not hrefs:
            continue
        links.append([texts[0], absolute_url(page_url, hrefs[0])])
    return links


def save_links(rows):
    """Insert *rows* into the MySQL table ``list`` and return the failure count.

    Each row is committed on its own, so one failing row is rolled back and
    reported without discarding the rows that were already stored.

    Args:
        rows: Iterable of ``[title, url]`` pairs.

    Returns:
        The number of rows that could not be stored.
    """
    rows = [tuple(row) for row in rows]
    if not rows:
        # Nothing to store, so no connection is opened.
        return 0

    # 1. Open the connection.
    conn = pymysql.connect(
        host='127.0.0.1',   # local MySQL server
        user='root',        # user name
        password='00000',   # password
        port=3306,          # MySQL default port
        database='gradem',  # database name
        charset='utf8'      # connection character set
    )
    failed = 0
    try:
        # 2. Create the cursor; the context manager closes it on exit.
        with conn.cursor() as cur:
            for n, (title, link) in enumerate(rows, start=1):
                try:
                    # 3. Run the parameterized insert.
                    cur.execute('insert into list(title, url) values(%s, %s)', (title, link))
                    # 4. Commit this row.
                    conn.commit()
                except pymysql.MySQLError as e:
                    # Undo the failed statement and report why it failed.
                    conn.rollback()
                    print('第' + str(n) + '数据存储失败!', e)
                    failed += 1
    finally:
        # 5. Always release the connection, even on an unexpected error.
        conn.close()
    return failed


def main():
    """Scrape the headlines, print them, and store them in MySQL."""
    links = fetch_links()
    print(links)
    # Pause a few seconds after the request, as the original script did.
    time.sleep(random.randint(3, 5))
    save_links(links)


if __name__ == '__main__':
    main()