#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date : 2017-08-29 18:38:23
# @Author : EnderZhou ([email protected])
# @Link : http://www.cnblogs.com/enderzhou/
# @Version : $Id$
import requests
import sys
from Queue import Queue
import threading
from bs4 import BeautifulSoup as bs
import re
# By default crawls all 76 pages of Baidu search-result URLs.
# Usage: python <this_file>.py keyword  (wrap the keyword in quotes if it
# contains special shell characters).
# Results are written to txt files. Baidu sponsored (ad) links are not yet
# filtered out; deduplication of same-site/same-path URLs that differ only
# in query parameters may be added later.
# https://www.baidu.com/s?wd=ichunqiu&pn=10
# wd参数为搜索内容关键字 pn参数控制页码 第二页为10 每页新增10 最大页数参数为750即76页。
headers = {‘User-Agent‘:‘Mozilla/5.0 (Windows NT 10.0; WOW64) ApplewebKit/537.36 (Khtml, like Gecko) Chrome/60.0.3112.78 Safari/537.36‘,}
class BaiduSpider(threading.Thread):
    """Worker thread: pulls Baidu result-page URLs from a shared queue and
    records every organic result link it finds.

    Side effects: appends resolved result URLs to out_para.txt and unique
    scheme://host prefixes to out_index.txt; echoes both to stdout.
    """

    def __init__(self, queue):
        threading.Thread.__init__(self)
        self._queue = queue  # shared work queue of result-page URLs

    def run(self):
        # Non-blocking get() inside the try fixes the race where another
        # worker drains the queue between empty() and a blocking get(),
        # which would hang this thread forever.
        while not self._queue.empty():
            try:
                url = self._queue.get(block=False)
                self.spider(url)
            except Exception:
                # Best-effort crawl: a page that fails to download or parse
                # (or an already-drained queue) is simply skipped.
                continue

    def spider(self, url):
        """Fetch one Baidu results page and record each outbound result link."""
        # Timeout keeps a hung server from stalling the worker indefinitely.
        r = requests.get(url=url, headers=headers, timeout=10)
        soup = bs(r.content, 'html.parser')
        # Organic results carry a data-click attribute; filtering on
        # class=None / data-is-main-url=None skips other anchor flavors.
        urllist = soup.find_all(name='a', attrs={'data-click': re.compile('.'), 'class': None, 'data-is-main-url': None})
        for link in urllist:
            # Follow Baidu's redirect wrapper to get the real target URL.
            resp = requests.get(url=link['href'], headers=headers, timeout=10)
            if resp.status_code == 200:
                parts = resp.url.split('/')
                site = parts[0] + '//' + parts[2] + '\n'  # scheme://host
                # Echo main domain + full URL; adjust here if the domain
                # line is not wanted on stdout.
                sys.stdout.write(site + resp.url + '\n')
                with open('out_para.txt', 'a+') as f_para:
                    f_para.write(resp.url + '\n')
                # Re-read the index file so hosts already recorded (by any
                # thread) are not duplicated. NOTE(review): not fully
                # race-free across threads, same as the original.
                with open('out_index.txt') as f_seen:
                    seen = f_seen.read()
                if site not in seen:
                    with open('out_index.txt', 'a+') as f_index:
                        f_index.write(site)
def main(keyword, thread_count=5):
    """Crawl all 76 Baidu result pages for *keyword*.

    :param keyword: search term (caller is responsible for quoting/encoding).
    :param thread_count: number of worker threads (default 5, as before).
    """
    queue = Queue()
    # Baidu pagination: pn=0,10,...,750 covers pages 1 through 76.
    for pn in range(0, 760, 10):
        queue.put('https://www.baidu.com/s?wd=' + keyword + '&pn=' + str(pn))
    threads = [BaiduSpider(queue) for _ in range(thread_count)]
    for t in threads:
        t.start()
    # Wait for every worker to drain the queue before returning.
    for t in threads:
        t.join()
if __name__ == '__main__':
    if len(sys.argv) != 2:
        # print(...) with a single argument is valid in both Python 2 and 3.
        print('Enter:python %s keyword' % sys.argv[0])
        sys.exit(-1)
    else:
        # Truncate both output files so each run starts from a clean slate.
        open('out_para.txt', 'w').close()
        open('out_index.txt', 'w').close()
        main(sys.argv[1])
# Source: http://www.bubuko.com/infodetail-2282806.html