- # coding=utf-8
- #@auther:Mana_菜小刀
- import requests
- import queue
- import threading
- import xlrd
- import xlwt
- from lxml import etree
- from xlutils.copy import copy
- from requests.packages.urllib3.exceptions import InsecureRequestWarning
- requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
# Request headers: spoof a desktop Chrome user agent so Baidu serves the
# normal (non-bot) results page.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) ApplewebKit/537.36 (Khtml, like Gecko) Chrome/65.0.3325.146 Safari/537.36"
}

# Bootstrap result.xls with a single header row; the worker threads append
# one result row per checked URL afterwards.
myxls = xlwt.Workbook()
sheet1 = myxls.add_sheet('收录 search')
lst_name = ['url', '收录 / 未收录', '图片']
for col, title in enumerate(lst_name):
    sheet1.write(0, col, title)
myxls.save('result.xls')
def log(*args, **kwargs):
    """Thin wrapper around print(); single point to swap in real logging later."""
    print(*args, **kwargs)
class baiduSpider(threading.Thread):
    """Worker thread for the Baidu index check.

    Pulls search-result URLs off a shared queue, decides whether the queried
    keyword looks indexed ("收录") and whether result thumbnails are present,
    and appends one row per URL to result.xls.
    """

    # Serialize the read-modify-write of result.xls: xlrd/xlutils/xlwt have no
    # append mode, so each row update copies the entire workbook and re-saves
    # it. Without this lock, concurrent workers silently overwrite each
    # other's rows.
    _xls_lock = threading.Lock()

    def __init__(self, queue_li, name):
        """queue_li: queue.Queue of URLs to fetch; name: informational label."""
        threading.Thread.__init__(self)
        self._queue = queue_li
        self._name = name

    def run(self):
        # Drain the queue. get_nowait() avoids the check-then-get race where
        # another worker empties the queue between empty() and get().
        while True:
            try:
                url = self._queue.get_nowait()
            except queue.Empty:
                break
            try:
                self.get_url(url)
            except Exception as e:
                # Best-effort crawler: log the failure and keep going.
                log(e)

    def get_url(self, url):
        """Fetch one Baidu SERP and append (keyword, indexed?, image?) to result.xls."""
        requests.adapters.DEFAULT_RETRIES = 5
        # Close the session deterministically instead of leaking the connection.
        with requests.session() as sess:
            sess.keep_alive = False
            resp = sess.get(url=url, headers=headers)
        xpather = etree.HTML(resp.text)
        strs = xpather.xpath('//span[@class="nums_text"]//text()')
        imgs = xpather.xpath('//img[@class="c-img c-img6"]/@src')
        search_mo = ['收录', '未收录']
        img_mo = ['有图', '无图']
        url_mo = url.replace('http://www.baidu.com/s?wd=', '')
        # Indexed iff the result-count span exists AND is not the "0 results"
        # text. Guarding on bool(strs) fixes the IndexError the original hit
        # when the xpath matched nothing.
        indexed = bool(strs) and strs[0] != "百度为您找到相关结果约 0 个"
        has_img = indexed and len(imgs) > 0
        search_flag = search_mo[0] if indexed else search_mo[1]
        img_flag = img_mo[0] if has_img else img_mo[1]
        with self._xls_lock:
            workbook = xlrd.open_workbook('result.xls', formatting_info=True)
            row = workbook.sheet_by_index(0).nrows
            newbook = copy(workbook)
            newsheet = newbook.get_sheet(0)
            newsheet.write(row, 0, url_mo)
            newsheet.write(row, 1, search_flag)
            newsheet.write(row, 2, img_flag)
            newbook.save('result.xls')
        log(search_flag, '丨', img_flag, '丨', url_mo)
def main():
    """Read keywords (one per line) from the 'urls' file, queue the matching
    Baidu search URLs, and fan out thread_count worker threads to check them.
    """
    queue_li = queue.Queue()
    threads = []
    thread_count = 10
    # Change 'urls' to the name of your own txt file:
    with open('urls', 'r', encoding='utf-8', errors="ignore") as f:
        for line in f:
            keyword = line.rstrip('\n')
            if keyword:
                queue_li.put('http://www.baidu.com/s?wd={}'.format(keyword))
    for i in range(thread_count):
        # The original passed the loop-leaked last URL as every thread's name
        # (NameError on an empty input file); use the worker index instead.
        threads.append(baiduSpider(queue_li, 'worker-{}'.format(i)))
    for t in threads:
        t.start()
    for t in threads:
        t.join()
# Script entry point: print the banner, then run the crawler.
# (Removed the dead bare-string copy of the log call that preceded this guard.)
if __name__ == '__main__':
    log("Mana 好伟大!(^-^)V")
    main()
# Source: http://www.bubuko.com/infodetail-3123479.html