Task:
Scrape ten pages of data from Scrape | Movie and store them in MongoDB.
Analyzing the page
Inspection shows the data is loaded dynamically; the first page is served by this API:
https://spa2.scrape.center/api/movie/?limit=10&offset=0&token=NmI0N2ZhZTE3ZDMyYWRjZGZhYTI1YTY4NDNkMGYxNzQxMDE1ZjlkMSwxNjQwNDIyNjU
The request carries an encrypted token parameter.
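Before touching the JS, it helps to base64-decode the token straight from the captured URL: the result is a comma-separated pair that looks like a 40-character SHA1 hex digest followed by a Unix timestamp. A quick sketch (the token below is the one copied above, which may be cut off; any full token taken from the DevTools network panel decodes the same way):
import base64

token = 'NmI0N2ZhZTE3ZDMyYWRjZGZhYTI1YTY4NDNkMGYxNzQxMDE1ZjlkMSwxNjQwNDIyNjU'
padded = token + '=' * (-len(token) % 4)  # restore base64 padding before decoding
print(base64.b64decode(padded).decode('utf-8'))
# expected shape: '<40-char sha1 hex digest>,<unix timestamp>'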
1. JS debugging shows the token is generated with SHA1.
2. Reproduce the encryption to generate the token value:
t = int(time.time())  # current timestamp
lis = ['/api/movie', '0', str(t)]  # reproduce the list r from the site's JS (offset 0 for page one)
s = ','.join(lis)  # join into one comma-separated string
md = hashlib.sha1()  # SHA1 digest of that string
md.update(s.encode('utf-8'))
tok = md.hexdigest()
# print(tok)
L = [str(tok), str(t)]  # digest plus timestamp
p = ','.join(L)  # join with a comma
to = base64.b64encode(p.encode())  # base64-encode the pair
token = to.decode('utf-8')
3. Building the URL with this token only returns the first page; later pages cannot be fetched.
Solution:
JS debugging of the second page's request shows that lis is not fixed; it changes from page to page:
lis = ['/api/movie', str(num), str(t)]  # reproduce the list r
Assign num dynamically from the page offset, as in the sketch below.
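Since the offset is part of the signed string, every page needs a freshly computed token. A minimal, self-contained sketch of the idea (make_token is just the snippet above folded into a function; the function name is mine):
import time
import hashlib
import base64

def make_token(offset):
    t = str(int(time.time()))
    # the offset must be inside the signed string, exactly as the site's JS builds it
    sign = hashlib.sha1(','.join(['/api/movie', str(offset), t]).encode('utf-8')).hexdigest()
    return base64.b64encode(','.join([sign, t]).encode('utf-8')).decode('utf-8')

for offset in (0, 10, 20):  # first three pages
    url = 'https://spa2.scrape.center/api/movie/?limit=10&offset={}&token={}'.format(offset, make_token(offset))
    print(url)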
Full source:
import time
import hashlib
import base64
import requests
from loguru import logger
from concurrent.futures import ThreadPoolExecutor
import pymongo


class Spiders():
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
            'Cookie': 'UM_distinctid=17d70bc7c7c4a1-07defacd232fa1-5d11371e-1fa400-17d70bc7c7d789',
            'Referer': 'https://spa2.scrape.center'
        }
        self.client = pymongo.MongoClient()
        self.database = self.client['python']

    def get_token(self, num):
        t = int(time.time())                    # current timestamp
        lis = ['/api/movie', str(num), str(t)]  # reproduce the list r from the site's JS
        s = ','.join(lis)                       # join into one comma-separated string
        md = hashlib.sha1()                     # SHA1 digest of that string
        md.update(s.encode('utf-8'))
        tok = md.hexdigest()
        L = [str(tok), str(t)]                  # digest plus timestamp
        p = ','.join(L)                         # join with a comma
        to = base64.b64encode(p.encode())       # base64-encode the pair
        token = to.decode('utf-8')              # bytes -> str
        return token

    def get_data(self, page, n):
        # page and n carry the same offset: it goes into the query string
        # and into the token signature, so the two must stay in sync
        token = self.get_token(n)
        url = 'https://spa2.scrape.center/api/movie/?limit=10&offset={}&token={}'.format(page, token)
        print(url)
        res = requests.get(url, headers=self.headers)
        items = res.json()['results']
        for i in items:
            title = i['name']
            categories = ' '.join(i['categories'])
            published_at = i['published_at']
            minute = i['minute']
            score = i['score']
            regions = ''.join(i['regions'])
            logger.info(f'片名{title},类型{categories},上映时间{published_at},时长{minute},评分{score},地区{regions}')
            # field names kept from the original: title, genres, release date, runtime, score, region
            dic = {
                '片名': title,
                '类型': categories,
                '上映时间': published_at,
                '时长': minute,
                '评分': score,
                '地区': regions
            }
            self.save(dic)

    def save(self, data):
        try:
            if isinstance(data, dict):
                self.database['电影排行榜'].insert_one(data)
            else:
                return '数据格式有误{}'.format(type(data))
        except Exception as e:
            return '数据有误%s' % e

    def run(self):
        with ThreadPoolExecutor(10) as f:
            for i in range(0, 91, 10):  # offsets 0, 10, ..., 90 -> ten pages
                n = i
                print(n)
                f.submit(self.get_data, i, n)


if __name__ == '__main__':
    obj = Spiders()
    obj.run()
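After a run, a quick check with pymongo confirms the data landed in MongoDB (a sketch assuming the default local connection and the database/collection names used above; a clean run of ten pages should yield 100 documents):
import pymongo

client = pymongo.MongoClient()
collection = client['python']['电影排行榜']
print(collection.count_documents({}))  # expected: 100 documents for ten pages of ten movies
for doc in collection.find().limit(3):  # peek at a few stored records
    print(doc)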