- import pymysql
- from urllib import request,parse
- from urllib.error import HTTPError,URLError
def main(url, headers=None, data=None):  # entry point for callers
    """Fetch *url* and return the decoded response body.

    Parameters:
        url: target URL.
        headers: optional request headers; a random User-Agent is
            supplied downstream when omitted.
        data: optional dict of POST fields; omitted means a plain GET.

    get_response already treats data=None as a GET, so the original
    if/else dispatch was redundant — one call covers both cases.
    """
    return get_response(url, headers=headers, data=data)
def get_response(url, data=None, headers=None):
    """Issue an HTTP request and return the decoded response body.

    Parameters:
        url: target URL.
        data: optional dict of POST fields; when given, the fields are
            urlencoded, UTF-8 encoded, and sent as a POST body.
        headers: optional headers dict; defaults to a random User-Agent
            fetched from the database via get_agent().

    Returns:
        The response body as str, or None when the request fails
        (the error is printed — best-effort, matching original behavior).
    """
    if not headers:
        headers = {'User-Agent': get_agent()}
    try:
        if data:
            # urlencode + encode in one step; passing data= makes it a POST
            data = parse.urlencode(data).encode('utf-8')
        # Request(url, data=None, ...) is identical to omitting data,
        # so one constructor call serves both GET and POST.
        req = request.Request(url, data=data, headers=headers)
        # Context manager closes the connection (the original leaked it).
        with request.urlopen(req) as response:
            return response.read().decode()
    except HTTPError as e:  # subclass of URLError — must be caught first
        print(e)
    except URLError as e:
        print(e)
def get_agent(table=None):
    """Return one random User-Agent string stored in MySQL.

    The agents were pre-generated with fake_useragent and stored in the
    database so the crawler does not depend on that module at runtime.

    Parameters:
        table: table holding the agents; defaults to 'p_useragent'.
            (The original overwrote the argument unconditionally,
            making the parameter dead — fixed here.)
    """
    if table is None:
        table = 'p_useragent'
    conn = pymysql.connect('127.0.0.1', 'root', '123456', 'PaChong', charset='utf8')
    try:
        with conn.cursor() as cursor:
            # Pick a pseudo-random row by id. The original hard-coded
            # 'p_useragent' in the last subquery; use *table* throughout.
            sql = ('SELECT * FROM {0} WHERE id >= '
                   '((SELECT MAX(Id) FROM {0}) - (SELECT MIN(Id) FROM {0})) * RAND() '
                   '+ (SELECT MIN(Id) FROM {0}) LIMIT 1').format(table)
            cursor.execute(sql)
            # Row layout assumed (id, useragent) — column 1 is the agent.
            return cursor.fetchall()[0][1]
    finally:
        conn.close()  # the original never closed the connection
if __name__ == '__main__':
    import json

    # Demo: POST to Baidu translate's suggestion endpoint.
    url = 'http://fanyi.baidu.com/sug'
    data = {'kw': '中国'}
    raw = main(url, data=data)
    # main() returns None when the request fails; the original passed
    # that straight to json.loads and crashed with TypeError.
    if raw is not None:
        print(json.loads(raw))

    # url = 'http://www.baidu.com'
    # res = main(url)
    # print(res)
正常情况下, 每写一个爬虫, 都需要执行分析 → 请求 → 响应 → 下载(存储)的流程, 但其中诸多功能其实都是在重复造轮子, 比如发起请求、设置请求头、为 POST 请求编码 data 值等. 可以将这些通用功能写到一个 py 文件里, 这样再写其他爬虫文件时直接调用, 就可以省去设置请求头、POST 传参转码等诸多重复操作.
来源: http://www.bubuko.com/infodetail-2726612.html