```python
import os

import requests
from bs4 import BeautifulSoup
from lxml import etree
from selenium import webdriver


# Parse one comic page and download its image
def manhua(url):
    # Load the page in the Selenium-driven browser
    browser.get(url)
    # Page source after JavaScript has run
    html = browser.page_source
    html = etree.HTML(html)
    img_url = html.xpath('//img[@id="mangaFile"]/@src')[0]
    alt = html.xpath('/html/body/div[2]/div[2]/h1/a/text()')[0]    # comic name
    title = html.xpath('/html/body/div[2]/div[2]/h2/text()')[0]    # chapter name
    print(img_url, alt, title)
    path = './漫画/' + alt + '/' + title + '/'
    if not os.path.exists(path):
        os.makedirs(path)
    fname = img_url.split('/')[-1]
    print(os.path.join(path, fname))
    # Request the image itself
    response = requests.get(img_url)
    # Raw binary body of the response
    data = response.content
    # Save the file
    with open(path + fname, 'wb') as f:
        f.write(data)
```
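One fragility worth noting: the bare `requests.get(img_url)` above will hang indefinitely on a stalled connection and will happily save an HTML error page under an image filename. A minimal hardening sketch; the helper name, timeout, and chunk size are my own choices, not from the original post:

```python
import requests

def save_image(img_url, filepath):
    # Fail fast instead of hanging, and surface HTTP errors as exceptions
    response = requests.get(img_url, timeout=10, stream=True)
    response.raise_for_status()
    # Stream to disk in chunks so large images never sit fully in memory
    with open(filepath, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)
```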
```python
# Parse a chapter page: read the page count, then build and visit
# every paginated comic-page link
def manhua_url(url):
    response = requests.get(url)
    response.encoding = response.apparent_encoding
    html = response.text
    html = etree.HTML(html)
    # i is the number of pages in this chapter; the slice strips the
    # surrounding brackets from the page-count text
    i = html.xpath('/html/body/div[2]/div[2]/span/text()')[1][1:-1]
    i = int(i)
    # The pagination follows a fixed pattern, so build each page URL with format()
    url = url + '/index.html?p={}'
    for n in range(1, i + 1):
        fullurl = url.format(n)
        print(fullurl)
        # time.sleep(2)  # optional: be polite and throttle requests
        # fullurl is one paginated comic page
        manhua(fullurl)
```
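To make the pagination pattern concrete, this is what the loop produces for a hypothetical chapter URL (made up for illustration):

```python
url = 'http://www.omanhua.com/comic/123' + '/index.html?p={}'
for n in range(1, 4):
    print(url.format(n))
# http://www.omanhua.com/comic/123/index.html?p=1
# http://www.omanhua.com/comic/123/index.html?p=2
# http://www.omanhua.com/comic/123/index.html?p=3
```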
```python
# Parse a comic's chapter-list page
# (renamed here to list_page so it doesn't shadow the builtin list)
def list_page(lb_url):
    response = requests.get(lb_url)
    response.encoding = response.apparent_encoding
    html = response.text
    html = BeautifulSoup(html, 'lxml')
    # Match every chapter link
    url_list = html.select('div.subBookList ul li')
    for url in url_list:
        # Keep the second-to-last path segment of the chapter href
        url = url.select('a')[0].get('href').split('/')[-2]
        fullurl = os.path.join(lb_url, url)
        print(fullurl)
        # Visit this chapter's pages
        manhua_url(fullurl)
```
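The CSS selector and the `split('/')[-2]` slice are easier to follow against a concrete snippet. The HTML below is reconstructed from the selectors as a guess at the page structure, not copied from the site:

```python
from bs4 import BeautifulSoup

snippet = '''
<div class="subBookList">
  <ul>
    <li><a href="/chapter-01/index.html">Chapter 1</a></li>
    <li><a href="/chapter-02/index.html">Chapter 2</a></li>
  </ul>
</div>
'''
soup = BeautifulSoup(snippet, 'lxml')
for li in soup.select('div.subBookList ul li'):
    href = li.select('a')[0].get('href')
    print(href.split('/')[-2])  # chapter-01, then chapter-02
```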
```python
# Parse the home page
def shouye():
    # Home page URL
    base_url = 'http://www.omanhua.com/'
    # Send the request
    response = requests.get(base_url)
    # Decode with the detected charset
    response.encoding = response.apparent_encoding
    # The returned HTML
    html = response.text
    # Parse it
    html = BeautifulSoup(html, 'lxml')
    # Match the links of the hottest comics
    url_list = html.select('ul#cartoon_image_show1 li')
    for url in url_list:
        # Strip the leading '/' so os.path.join doesn't treat it as an absolute path
        url = url.select('a')[0].get('href')[1:]
        # Build the absolute comic link
        fullurl = os.path.join(base_url, url)
        print(fullurl)
        list_page(fullurl)
```
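A note on `apparent_encoding`, since it appears in every request here: `requests` picks `response.encoding` from the HTTP headers (falling back to ISO-8859-1 for text responses with no declared charset), while `apparent_encoding` runs charset detection on the body itself. For Chinese pages served without an explicit charset, copying the detected value over before touching `.text` is what keeps the HTML from turning into mojibake:

```python
import requests

response = requests.get('http://www.omanhua.com/')
print(response.encoding)           # what the HTTP headers claim
print(response.apparent_encoding)  # what charset detection infers from the body
response.encoding = response.apparent_encoding  # decode .text with the detected charset
```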
```python
if __name__ == '__main__':
    # Drive a real Chrome browser with Selenium: the comic images are filled in
    # by JavaScript, so plain requests can't see their URLs.
    # The path is the location of the chromedriver executable;
    # browser is used as a module-level global by manhua().
    browser = webdriver.Chrome(executable_path=r'C:\Users\zhaozhi\Desktop\chromedriver.exe')
    shouye()
```
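If a Chrome window popping up for every page is unwanted, the same driver can run headless. This sketch keeps the Selenium 3 style `executable_path` argument used above; note that newer Selenium versions pass the driver path through a `Service` object instead:

```python
from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless')
browser = webdriver.Chrome(
    executable_path=r'C:\Users\zhaozhi\Desktop\chromedriver.exe',
    options=options,
)
```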
I've only recently started teaching myself web scraping, so the code may be a bit clumsy in places; I hope to learn and improve together with everyone.
Source: https://www.cnblogs.com/lyxdw/p/9226583.html