```python
import os

import requests
from bs4 import BeautifulSoup
from lxml import etree
from selenium import webdriver


# Parse one comic page and download its image
def manhua(url):
    # Load the page in the Selenium-driven browser
    browser.get(url)
    # Page source after JavaScript has run
    html = browser.page_source
    html = etree.HTML(html)
    img_url = html.xpath('//img[@id="mangaFile"]/@src')[0]
    alt = html.xpath('/html/body/div[2]/div[2]/h1/a/text()')[0]    # comic name
    title = html.xpath('/html/body/div[2]/div[2]/h2/text()')[0]    # chapter name
    print(img_url, alt, title)
    path = './漫画/' + alt + '/' + title + '/'
    if not os.path.exists(path):
        os.makedirs(path)
    fname = img_url.split('/')[-1]
    print(os.path.join(path, fname))
    # Request the image itself
    response = requests.get(img_url)
    # Raw binary body of the response
    data = response.content
    # Save the file
    with open(path + fname, 'wb') as f:
        f.write(data)
```
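One fragility worth noting: the bare `requests.get(img_url)` above will hang indefinitely on a stalled connection and will happily save an HTML error page under an image filename. A minimal hardening sketch; the helper name, timeout, and chunk size are my own choices, not from the original post:

```python
import requests

def save_image(img_url, filepath):
    # Fail fast instead of hanging, and surface HTTP errors as exceptions
    response = requests.get(img_url, timeout=10, stream=True)
    response.raise_for_status()
    # Stream to disk in chunks so large images never sit fully in memory
    with open(filepath, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)
```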
```python
# Parse a chapter page: read the page count, then build and visit
# every paginated comic-page link
def manhua_url(url):
    response = requests.get(url)
    response.encoding = response.apparent_encoding
    html = response.text
    html = etree.HTML(html)
    # i is the number of pages in this chapter; the slice strips the
    # surrounding brackets from the page-count text
    i = html.xpath('/html/body/div[2]/div[2]/span/text()')[1][1:-1]
    i = int(i)
    # The pagination follows a fixed pattern, so build each page URL with format()
    url = url + '/index.html?p={}'
    for n in range(1, i + 1):
        fullurl = url.format(n)
        print(fullurl)
        # time.sleep(2)  # optional: be polite and throttle requests
        # fullurl is one paginated comic page
        manhua(fullurl)
```
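To make the pagination pattern concrete, this is what the loop produces for a hypothetical chapter URL (made up for illustration):

```python
url = 'http://www.omanhua.com/comic/123' + '/index.html?p={}'
for n in range(1, 4):
    print(url.format(n))
# http://www.omanhua.com/comic/123/index.html?p=1
# http://www.omanhua.com/comic/123/index.html?p=2
# http://www.omanhua.com/comic/123/index.html?p=3
```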
```python
# Parse a comic's chapter-list page
# (renamed here to list_page so it doesn't shadow the builtin list)
def list_page(lb_url):
    response = requests.get(lb_url)
    response.encoding = response.apparent_encoding
    html = response.text
    html = BeautifulSoup(html, 'lxml')
    # Match every chapter link
    url_list = html.select('div.subBookList ul li')
    for url in url_list:
        # Keep the second-to-last path segment of the chapter href
        url = url.select('a')[0].get('href').split('/')[-2]
        fullurl = os.path.join(lb_url, url)
        print(fullurl)
        # Visit this chapter's pages
        manhua_url(fullurl)
```
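The CSS selector and the `split('/')[-2]` slice are easier to follow against a concrete snippet. The HTML below is reconstructed from the selectors as a guess at the page structure, not copied from the site:

```python
from bs4 import BeautifulSoup

snippet = '''
<div class="subBookList">
  <ul>
    <li><a href="/chapter-01/index.html">Chapter 1</a></li>
    <li><a href="/chapter-02/index.html">Chapter 2</a></li>
  </ul>
</div>
'''
soup = BeautifulSoup(snippet, 'lxml')
for li in soup.select('div.subBookList ul li'):
    href = li.select('a')[0].get('href')
    print(href.split('/')[-2])  # chapter-01, then chapter-02
```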
```python
# Parse the home page
def shouye():
    # Home page URL
    base_url = 'http://www.omanhua.com/'
    # Send the request
    response = requests.get(base_url)
    # Decode with the detected charset
    response.encoding = response.apparent_encoding
    # The returned HTML
    html = response.text
    # Parse it
    html = BeautifulSoup(html, 'lxml')
    # Match the links of the hottest comics
    url_list = html.select('ul#cartoon_image_show1 li')
    for url in url_list:
        # Strip the leading '/' so os.path.join doesn't treat it as an absolute path
        url = url.select('a')[0].get('href')[1:]
        # Build the absolute comic link
        fullurl = os.path.join(base_url, url)
        print(fullurl)
        list_page(fullurl)
```
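A note on `apparent_encoding`, since it appears in every request here: `requests` picks `response.encoding` from the HTTP headers (falling back to ISO-8859-1 for text responses with no declared charset), while `apparent_encoding` runs charset detection on the body itself. For Chinese pages served without an explicit charset, copying the detected value over before touching `.text` is what keeps the HTML from turning into mojibake:

```python
import requests

response = requests.get('http://www.omanhua.com/')
print(response.encoding)           # what the HTTP headers claim
print(response.apparent_encoding)  # what charset detection infers from the body
response.encoding = response.apparent_encoding  # decode .text with the detected charset
```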
```python
if __name__ == '__main__':
    # Drive a real Chrome browser with Selenium: the comic images are filled in
    # by JavaScript, so plain requests can't see their URLs.
    # The path is the location of the chromedriver executable;
    # browser is used as a module-level global by manhua().
    browser = webdriver.Chrome(executable_path=r'C:\Users\zhaozhi\Desktop\chromedriver.exe')
    shouye()
```
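If a Chrome window popping up for every page is unwanted, the same driver can run headless. This sketch keeps the Selenium 3 style `executable_path` argument used above; note that newer Selenium versions pass the driver path through a `Service` object instead:

```python
from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless')
browser = webdriver.Chrome(
    executable_path=r'C:\Users\zhaozhi\Desktop\chromedriver.exe',
    options=options,
)
```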
I've only recently started teaching myself web scraping, so the code may be a bit clumsy in places; I hope to learn and improve together with everyone.
Source: https://www.cnblogs.com/lyxdw/p/9226583.html