"""Collect movie links from www.seedmm.com into an Excel workbook.

The script loads ./test.xlsx, downloads the site's home page, follows every
anchor whose href contains "page", and for each ``<a class="movie-box">`` on
those pages writes the link target (column A) and the link text (column B)
into the workbook's active sheet. The result is saved as test2.xlsx.
"""
# Import modules
import re
import urllib.parse
import urllib.request

import openpyxl
from bs4 import BeautifulSoup

# Web settings
BASE_URL = "http://www.seedmm.com"
USER_AGENT = "Mozilla/5.0"

# Excel files
INPUT_WORKBOOK = "./test.xlsx"
OUTPUT_WORKBOOK = "test2.xlsx"


def fetch_soup(url):
    """Download *url* and return its body parsed with ``html.parser``.

    Args:
        url: Absolute URL to request.

    Returns:
        A ``BeautifulSoup`` document built from the response body.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the request
            fails.
    """
    request = urllib.request.Request(url)
    request.add_header("user-agent", USER_AGENT)
    # The context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(request) as response:
        html_doc = response.read()
    return BeautifulSoup(html_doc, "html.parser")


# Excel file: rows are written to the workbook's active sheet.
work_book = openpyxl.load_workbook(INPUT_WORKBOOK)
work_sheet = work_book.active

# Listing pages are the home-page anchors whose href contains "page".
page_links = fetch_soup(BASE_URL).find_all("a", href=re.compile(r"page"))

# Main program
row = 1  # next worksheet row to fill; starts at row 1
for page_link in page_links:
    # urljoin handles relative and absolute hrefs alike, unlike plain
    # string concatenation with the base URL.
    page_url = urllib.parse.urljoin(BASE_URL, page_link["href"])
    movie_links = fetch_soup(page_url).find_all("a", class_="movie-box")
    print(page_url)
    for movie_link in movie_links:
        work_sheet.cell(row=row, column=1, value=movie_link["href"])
        work_sheet.cell(row=row, column=2, value=movie_link.get_text())
        row += 1
    # Save after each listing page so a failure on a later page keeps the
    # rows collected so far.
    work_book.save(OUTPUT_WORKBOOK)
# This snippet comes from http://www.codesnippet.cn/detail/0508201614907.html
# Source: http://www.codesnippet.cn/detail/0508201614907.html