- # coding=utf-8
- #@auther:Mana_菜小刀
- import requests
- import queue
- import threading
- import xlrd
- import xlwt
- from lxml import etree
- from xlutils.copy import copy
- from requests.packages.urllib3.exceptions import InsecureRequestWarning
- requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
# Request headers: spoof a desktop Chrome user agent so Baidu serves the
# normal (non-bot) results page.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) ApplewebKit/537.36 (Khtml, like Gecko) Chrome/65.0.3325.146 Safari/537.36"
}

# Bootstrap result.xls with a single header row; the worker threads append
# one result row per checked URL afterwards.
myxls = xlwt.Workbook()
sheet1 = myxls.add_sheet('收录 search')
lst_name = ['url', '收录 / 未收录', '图片']
for col, title in enumerate(lst_name):
    sheet1.write(0, col, title)
myxls.save('result.xls')
def log(*args, **kwargs):
    """Thin wrapper around print(); single point to swap in real logging later."""
    print(*args, **kwargs)
class baiduSpider(threading.Thread):
    """Worker thread for the Baidu index check.

    Pulls search-result URLs off a shared queue, decides whether the queried
    keyword looks indexed ("收录") and whether result thumbnails are present,
    and appends one row per URL to result.xls.
    """

    # Serialize the read-modify-write of result.xls: xlrd/xlutils/xlwt have no
    # append mode, so each row update copies the entire workbook and re-saves
    # it. Without this lock, concurrent workers silently overwrite each
    # other's rows.
    _xls_lock = threading.Lock()

    def __init__(self, queue_li, name):
        """queue_li: queue.Queue of URLs to fetch; name: informational label."""
        threading.Thread.__init__(self)
        self._queue = queue_li
        self._name = name

    def run(self):
        # Drain the queue. get_nowait() avoids the check-then-get race where
        # another worker empties the queue between empty() and get().
        while True:
            try:
                url = self._queue.get_nowait()
            except queue.Empty:
                break
            try:
                self.get_url(url)
            except Exception as e:
                # Best-effort crawler: log the failure and keep going.
                log(e)

    def get_url(self, url):
        """Fetch one Baidu SERP and append (keyword, indexed?, image?) to result.xls."""
        requests.adapters.DEFAULT_RETRIES = 5
        # Close the session deterministically instead of leaking the connection.
        with requests.session() as sess:
            sess.keep_alive = False
            resp = sess.get(url=url, headers=headers)
        xpather = etree.HTML(resp.text)
        strs = xpather.xpath('//span[@class="nums_text"]//text()')
        imgs = xpather.xpath('//img[@class="c-img c-img6"]/@src')
        search_mo = ['收录', '未收录']
        img_mo = ['有图', '无图']
        url_mo = url.replace('http://www.baidu.com/s?wd=', '')
        # Indexed iff the result-count span exists AND is not the "0 results"
        # text. Guarding on bool(strs) fixes the IndexError the original hit
        # when the xpath matched nothing.
        indexed = bool(strs) and strs[0] != "百度为您找到相关结果约 0 个"
        has_img = indexed and len(imgs) > 0
        search_flag = search_mo[0] if indexed else search_mo[1]
        img_flag = img_mo[0] if has_img else img_mo[1]
        with self._xls_lock:
            workbook = xlrd.open_workbook('result.xls', formatting_info=True)
            row = workbook.sheet_by_index(0).nrows
            newbook = copy(workbook)
            newsheet = newbook.get_sheet(0)
            newsheet.write(row, 0, url_mo)
            newsheet.write(row, 1, search_flag)
            newsheet.write(row, 2, img_flag)
            newbook.save('result.xls')
        log(search_flag, '丨', img_flag, '丨', url_mo)
def main():
    """Read keywords (one per line) from the 'urls' file, queue the matching
    Baidu search URLs, and fan out thread_count worker threads to check them.
    """
    queue_li = queue.Queue()
    threads = []
    thread_count = 10
    # Change 'urls' to the name of your own txt file:
    with open('urls', 'r', encoding='utf-8', errors="ignore") as f:
        for line in f:
            keyword = line.rstrip('\n')
            if keyword:
                queue_li.put('http://www.baidu.com/s?wd={}'.format(keyword))
    for i in range(thread_count):
        # The original passed the loop-leaked last URL as every thread's name
        # (NameError on an empty input file); use the worker index instead.
        threads.append(baiduSpider(queue_li, 'worker-{}'.format(i)))
    for t in threads:
        t.start()
    for t in threads:
        t.join()
# Script entry point: print the banner, then run the crawler.
# (Removed the dead bare-string copy of the log call that preceded this guard.)
if __name__ == '__main__':
    log("Mana 好伟大!(^-^)V")
    main()
# Source: http://www.bubuko.com/infodetail-3123479.html