# Fixing the "No Data on the Main Page" Problem by Starting from Ajax

The Toutiao search page loads its results through an Ajax interface, so instead of scraping the rendered page, the script below requests the JSON endpoint directly, follows each `article_url` to its gallery page, downloads the images (deduplicated by md5), and records each result in MongoDB.
```python
import json
import os
import re
from hashlib import md5
from multiprocessing import Pool
from urllib.parse import urlencode

import pymongo
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException

from config import *

# The Mongo service must already be running, with your chosen database
# created, or the connection below fails.
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]

def save_to_mongo(ret_dict):
    # Knowledge point 8: MongoDB connection, with settings passed in from the config file
    if db[MONGO_TABLE].insert_one(ret_dict):
        print("Inserted into the database:", ret_dict["title"])
        return True
    return False
```
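The star import pulls `MONGO_URL`, `MONGO_DB`, `MONGO_TABLE`, `GROUP_START`, and `GROUP_END` out of a separate `config.py`, which the original post does not show. A minimal sketch with placeholder values might look like this:

```python
# config.py -- hypothetical placeholder values, not from the original post
MONGO_URL = 'localhost'   # host passed to MongoClient
MONGO_DB = 'toutiao'      # database name
MONGO_TABLE = 'jiepai'    # collection for the scraped records
GROUP_START = 0           # first page group (offset = group * 20)
GROUP_END = 5             # last page group, inclusive
```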
```python
def get_page_index(keyword, offset, headers, cookie):
    """Request one page of the search API; return the raw JSON text, or None."""
    try:
        data = {
            'aid': "24",
            'app_name': "web_search",
            'offset': offset,
            'format': "json",
            'keyword': keyword,
            'autoload': "true",
            'count': "20",
            'en_qc': "1",
            'cur_tab': "1",
            'from': 'search_tab',
            'pd': "synthesis",
            'timestamp': "1585525783382",
            '_signature': "MqqdBAAgEBC1BxnpKjcMhjKr3BAAGwyzftELDyc2Vi7Ug4gGwX7WlzBBtoBfhTP9rT-Eha5MhBFoxSsOVuYXGF4F1L2sGmX9A07QT2rsGhAXHp38jFF3LG2nRBQu9o52X09"
        }
        # Build the query string with urllib's encoder.
        url = "https://www.toutiao.com/api/search/content/?" + urlencode(data)
        response = requests.get(url, headers=headers, cookies=cookie)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print("Error: the index request failed")
        return None
```
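`urlencode` serializes the parameter dict into the query string the browser would send, percent-encoding anything non-ASCII. A quick illustration:

```python
from urllib.parse import urlencode

params = {'offset': 0, 'keyword': '街拍', 'count': '20'}
print(urlencode(params))
# offset=0&keyword=%E8%A1%97%E6%8B%8D&count=20
```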
```python
# Extract the article URLs from the index response.
def parse_page_index(html):
    """Build a generator; returning a plain list would work equally well."""
    data = json.loads(html)
    if data and "data" in data.keys():
        for item in data.get("data"):  # Knowledge point 3: dict .get() for key lookup
            if "article_url" in item.keys():
                url = item.get("article_url")
                yield url
```
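The generator assumes the index response is JSON with a top-level `data` array whose items may or may not carry an `article_url`. Feeding it a hand-made payload (invented here for illustration) shows the filtering:

```python
sample = '{"data": [{"article_url": "https://example.com/a/1"}, {"title": "no url"}]}'
for u in parse_page_index(sample):
    print(u)  # only items that actually contain an article_url are yielded
# https://example.com/a/1
```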
```python
# Fetch the detail page for a gallery.
def get_page_detail(url, headers, cookie):
    try:
        response = requests.get(url, headers=headers, cookies=cookie)
        if response.status_code == 200:
            content = response.content.decode()
            return content
        return None
    except RequestException:
        print("Error in get_page_detail")
        return None
```
```python
# Download one image.
def download(url, headers, cookie):
    print("Downloading image", url)
    try:
        response = requests.get(url, headers=headers, cookies=cookie)
        if response.status_code == 200:
            content = response.content
            saveimg(content)
        return None
    except RequestException:
        print("Request failed")
        return None
```
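One design alternative, not used in the original post, is requests' streaming mode: hash and write the image in chunks instead of buffering the whole body in memory. A hedged sketch that keeps the md5-named-file deduplication of `saveimg`:

```python
def download_streamed(url, headers, cookie):
    # Hypothetical variant of download(); trades saveimg's pre-write
    # existence check for lower peak memory on large images.
    response = requests.get(url, headers=headers, cookies=cookie, stream=True)
    if response.status_code != 200:
        return
    hasher = md5()
    tmp_path = os.path.join(os.getcwd(), "download.part")
    with open(tmp_path, "wb") as f:
        for chunk in response.iter_content(chunk_size=8192):
            hasher.update(chunk)
            f.write(chunk)
    final_path = os.path.join(os.getcwd(), hasher.hexdigest() + ".jpg")
    if os.path.exists(final_path):
        os.remove(tmp_path)               # duplicate image: discard
    else:
        os.replace(tmp_path, final_path)  # rename into place
```

Note that the fixed temp filename would clash across the worker processes of the `Pool` used below; a per-process name (for example, derived from `os.getpid()`) would be needed in that setup.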
```python
# Save an image, using its md5 digest as the filename.
def saveimg(content):
    # Knowledge point 9: md5-based deduplication; a quick refresher on md5
    file_path = "{0}/{1}.{2}".format(os.getcwd(), md5(content).hexdigest(), "jpg")
    if not os.path.exists(file_path):  # Knowledge point 10: using the os module
        with open(file_path, "wb") as f:
            f.write(content)
```
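"Knowledge point 9" works because md5 is deterministic: the same bytes always produce the same digest, so a re-downloaded duplicate maps onto an existing filename and the write is skipped:

```python
from hashlib import md5

a = md5(b"same image bytes").hexdigest()
b = md5(b"same image bytes").hexdigest()
print(a == b)  # True -- duplicate content collapses onto one file_path
```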
```python
# Parse a gallery page: the title, plus the image URLs embedded in the gallery JSON.
def parse_page_detail(html, url, headers, cookie):
    soup = BeautifulSoup(html, 'lxml')
    result = soup.select('title')
    title = result[0].get_text() if result else ''
    images_pattern = re.compile(r'gallery: JSON.parse\("(.*)"\)', re.S)
    result = re.search(images_pattern, html)
    if result:
        ret = result.group(1)
        ret = ret.replace("\\", "")      # strip the JS string escaping
        ret = ret.replace("u002F", "/")  # restore the escaped slashes
        data = json.loads(ret)
        if data and 'sub_images' in data.keys():
            sub_images = data.get('sub_images')
            images = [item.get('url') for item in sub_images]
            for image in images:
                download(image, headers, cookie)
            return {
                'title': title,
                'url': url,
                'images': images
            }
```
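The two `replace` calls undo the JavaScript string escaping around the inlined gallery JSON: the page ships `/` as `\u002F` and quotes as `\"`, so stripping the backslashes and mapping the leftover `u002F` back to `/` yields parseable JSON. On an invented sample string:

```python
raw = r'{\"sub_images\": [{\"url\": \"http:\u002F\u002Fexample.com\u002Fa.jpg\"}]}'
clean = raw.replace("\\", "").replace("u002F", "/")
print(json.loads(clean)['sub_images'][0]['url'])
# http://example.com/a.jpg
```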
```python
def main(offset):
    headers = {'user-agent': 'xxx'}
    cookie = {'cookie': 'xxx'}
    html = get_page_index("街拍", offset, headers, cookie)
    if not html:
        return
    for url in parse_page_index(html):
        detail = get_page_detail(url, headers, cookie)
        if detail:
            result = parse_page_detail(detail, url, headers, cookie)
            if result:
                print(result)
                save_to_mongo(result)

if __name__ == "__main__":
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    pool = Pool()
    pool.map(main, groups)
    pool.close()
    pool.join()
```
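Each group corresponds to a page offset in steps of 20 (matching the `count` parameter of the Ajax call), and `Pool.map` fans those offsets out across worker processes. With, say, `GROUP_START = 0` and `GROUP_END = 2`:

```python
groups = [x * 20 for x in range(0, 2 + 1)]
print(groups)  # [0, 20, 40] -- one main(offset) task per entry
```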
Source: http://www.bubuko.com/infodetail-3483936.html