- import pymysql
- from urllib import request,parse
- from urllib.error import HTTPError,URLError
def main(url, headers=None, data=None):  # entry point for callers
    """Fetch *url* and return the decoded response body.

    Parameters:
        url: target URL.
        headers: optional request headers; a random User-Agent is
            supplied downstream when omitted.
        data: optional dict of POST fields; omitted means a plain GET.

    get_response already treats data=None as a GET, so the original
    if/else dispatch was redundant — one call covers both cases.
    """
    return get_response(url, headers=headers, data=data)
def get_response(url, data=None, headers=None):
    """Issue an HTTP request and return the decoded response body.

    Parameters:
        url: target URL.
        data: optional dict of POST fields; when given, the fields are
            urlencoded, UTF-8 encoded, and sent as a POST body.
        headers: optional headers dict; defaults to a random User-Agent
            fetched from the database via get_agent().

    Returns:
        The response body as str, or None when the request fails
        (the error is printed — best-effort, matching original behavior).
    """
    if not headers:
        headers = {'User-Agent': get_agent()}
    try:
        if data:
            # urlencode + encode in one step; passing data= makes it a POST
            data = parse.urlencode(data).encode('utf-8')
        # Request(url, data=None, ...) is identical to omitting data,
        # so one constructor call serves both GET and POST.
        req = request.Request(url, data=data, headers=headers)
        # Context manager closes the connection (the original leaked it).
        with request.urlopen(req) as response:
            return response.read().decode()
    except HTTPError as e:  # subclass of URLError — must be caught first
        print(e)
    except URLError as e:
        print(e)
def get_agent(table=None):
    """Return one random User-Agent string stored in MySQL.

    The agents were pre-generated with fake_useragent and stored in the
    database so the crawler does not depend on that module at runtime.

    Parameters:
        table: table holding the agents; defaults to 'p_useragent'.
            (The original overwrote the argument unconditionally,
            making the parameter dead — fixed here.)
    """
    if table is None:
        table = 'p_useragent'
    conn = pymysql.connect('127.0.0.1', 'root', '123456', 'PaChong', charset='utf8')
    try:
        with conn.cursor() as cursor:
            # Pick a pseudo-random row by id. The original hard-coded
            # 'p_useragent' in the last subquery; use *table* throughout.
            sql = ('SELECT * FROM {0} WHERE id >= '
                   '((SELECT MAX(Id) FROM {0}) - (SELECT MIN(Id) FROM {0})) * RAND() '
                   '+ (SELECT MIN(Id) FROM {0}) LIMIT 1').format(table)
            cursor.execute(sql)
            # Row layout assumed (id, useragent) — column 1 is the agent.
            return cursor.fetchall()[0][1]
    finally:
        conn.close()  # the original never closed the connection
if __name__ == '__main__':
    import json

    # Demo: POST to Baidu translate's suggestion endpoint.
    url = 'http://fanyi.baidu.com/sug'
    data = {'kw': '中国'}
    raw = main(url, data=data)
    # main() returns None when the request fails; the original passed
    # that straight to json.loads and crashed with TypeError.
    if raw is not None:
        print(json.loads(raw))

    # url = 'http://www.baidu.com'
    # res = main(url)
    # print(res)
正常情况下, 每写一个爬虫, 都需要执行分析 → 请求 → 响应 → 下载(存储)的流程, 但其中诸多功能其实都是在重复造轮子, 比如发起请求、设置请求头、为 POST 请求编码 data 值等. 可以将这些通用功能写到一个 py 文件里, 这样再写其他爬虫文件时直接调用, 就可以省去设置请求头、POST 传参转码等诸多重复操作.
来源: http://www.bubuko.com/infodetail-2726612.html