Python web scraping: BeautifulSoup makes it easy to parse a fetched page. The example below uses it to crawl job listings from Boss 直聘, extracting the job title, salary, location, company name, the company's funding stage, and more. It gives a quick view of how BeautifulSoup is used in practice.
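As a warm-up, here is a minimal, self-contained sketch of the BeautifulSoup calls the spiders below rely on (`find`, `find_all`, the `class_` keyword argument, `get_text`); the HTML snippet is invented purely for illustration:

```python
from bs4 import BeautifulSoup

# A tiny, made-up HTML snippet mimicking the job-list markup used below.
html = '''
<ul class="job-list">
  <li><span class="job-title">Backend Developer</span>
      <span class="red">15-30K</span></li>
  <li><span class="job-title">Data Engineer</span>
      <span class="red">20-40K</span></li>
</ul>
'''

soup = BeautifulSoup(html, 'lxml')  # parse with the lxml parser
for li in soup.find(class_='job-list').find_all('li'):
    title = li.find(class_='job-title').get_text()  # first matching tag
    salary = li.find(class_='red').get_text()
    print(title, salary)
```

`find` returns the first matching tag and `find_all` returns every match; the keyword is spelled `class_` because `class` is a reserved word in Python.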
1. Scraping job listings from Boss 直聘
```python
import time

import requests
from bs4 import BeautifulSoup

# get_random_proxy / get_random_agent come from a local middlewares module
# (not shown in the original post; see the sketch after the output below).
from middlewares import get_random_proxy, get_random_agent


class Boss_Spider(object):
    def __init__(self, page=3):
        self.proxies = []
        self.verify_pro = []
        self.page = page
        self.headers = {}

    # Step 1: collect every job-category link from the home page.
    def Parse_pre(self):
        base_url = 'https://www.zhipin.com/'
        headers = get_random_agent()
        proxy = get_random_proxy()
        time.sleep(1)  # throttle requests to avoid being blocked
        resp = requests.get(base_url, headers=headers, proxies=proxy)
        if resp.status_code == 200:
            soup = BeautifulSoup(resp.text, 'lxml')
            for job_menu in soup.find_all(class_='menu-sub'):
                for li in job_menu.find_all('li'):
                    job_type = li.find('h4').get_text()
                    for job_list in li.find_all('a'):
                        job_sub = job_list.get_text()
                        job_uri = job_list['href']
                        # Crawl the first self.page result pages per category.
                        for i in range(1, self.page + 1):
                            job_url = base_url + job_uri + '?page=%d&ka=page-%d' % (i, i)
                            meta = {
                                'job_type': job_type,
                                'job_sub': job_sub,
                            }
                            self.Parse_index(meta=meta, url=job_url)

    # Step 2: parse the job listings on one result page.
    def Parse_index(self, meta, url):
        headers = get_random_agent()
        proxy = get_random_proxy()
        time.sleep(1)
        resp = requests.get(url, headers=headers, proxies=proxy)
        if resp.status_code == 200:
            soup = BeautifulSoup(resp.text, 'lxml')
            for li in soup.find(class_='job-list').find_all('li'):
                print('###########')
                position = li.find(class_='job-title').get_text()
                salary = li.find(class_='red').get_text()
                add = li.find('p').get_text()  # location / experience / degree
                need = li.find('p').find('em').get_text()
                company_name = li.find(class_='company-text').find('a').get_text()
                tag = li.find(class_='company-text').find('p').get_text()
                print(position, "$$$", salary, "$$$", add, "$$$", need,
                      "$$$", company_name, "$$$", tag)


if __name__ == '__main__':
    b = Boss_Spider()
    b.Parse_pre()
```
Running the spider prints output like this:
```
后端开发 $$$ 15-30K $$$ 北京 朝阳区 朝外 3-5 年本科 $$$ $$$ 米花互动 $$$ 游戏不需要融资 20-99 人
###########
后端开发工程师 $$$ 35-55K $$$ 北京 朝阳区 望京经验不限本科 $$$ $$$ 云账户 $$$ 移动互联网 C 轮 100-499 人
###########
```
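Note that the spider imports `get_random_proxy` and `get_random_agent` from a local `middlewares` module that the post does not include. Here is a minimal sketch of what such a module might look like, assuming it simply rotates over hand-maintained pools; the proxy address and User-Agent strings are placeholders, not working values:

```python
# middlewares.py -- hypothetical sketch; the original module is not shown.
import random

# Placeholder pools; replace with real proxies and User-Agent strings.
PROXIES = [
    {'http': 'http://127.0.0.1:8080', 'https': 'http://127.0.0.1:8080'},
]
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
]

def get_random_proxy():
    # Return a proxies dict in the format requests expects.
    return random.choice(PROXIES)

def get_random_agent():
    # Return a headers dict with a random User-Agent.
    return {'User-Agent': random.choice(USER_AGENTS)}
```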
2. Scraping Douban's Top 250 books
```python
import requests
from bs4 import BeautifulSoup
```
A helper that sends the request and returns the HTML source:
```python
def get_html(url):
    # Pretend to be a browser so the request is not rejected.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
    resp = requests.get(url, headers=headers).text
    return resp
```
Parse each page and extract the data:
```python
def html_parse():
    i = 1
    # Iterate over every page URL returned by all_page().
    for url in all_page():
        soup = BeautifulSoup(get_html(url), 'lxml')
        # Book titles
        alldiv = soup.find_all('div', class_='pl2')
        names = [a.find('a')['title'] for a in alldiv]
        # Authors
        allp = soup.find_all('p', class_='pl')
        authors = [p.get_text() for p in allp]
        # Ratings
        starspan = soup.find_all('span', class_='rating_nums')
        scores = [s.get_text() for s in starspan]
        # One-line blurbs
        sumspan = soup.find_all('span', class_='inq')
        sums = [s.get_text() for s in sumspan]
        # Note: zip() stops at the shortest list, so books without a
        # blurb can shift the pairing of the remaining fields.
        for name, author, score, summary in zip(names, authors, scores, sums):
            name = '书名:' + str(name) + '\n'
            author = '作者:' + str(author) + '\n'
            score = '评分:' + str(score) + '\n'
            summary = '简介:' + str(summary) + '\n'
            data = str(i) + '\n' + name + author + score + summary
            i = i + 1
            # Save the record to the file opened at module level.
            f.writelines(data + '=======================' + '\n')
```
A function that collects the URLs of all the pages:
```python
def all_page():
    url = 'https://book.douban.com/top250?start=200'
    urllist = []
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    }
    resp = BeautifulSoup(requests.get(url, headers=headers).text, 'lxml')
    i = 1
    # Collect the other pages' URLs from the paginator, skipping its first link.
    for h in resp.find('div', class_='paginator').find_all('a'):
        if i > 1:
            urllist.append(h['href'])
        i = i + 1
    urllist.append(url)  # include the page we started from
    print(urllist)
    return urllist
```
Finally, set the output filename, open the file, and run the parser:

```python
filename = '豆瓣图书 Top250.txt'
f = open(filename, 'w', encoding='utf-8')  # html_parse() writes to this file

html_parse()
f.close()
print('保存成功.')
```
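Since the Top 250 list spans ten pages of 25 books addressed via the `start` query parameter, the page URLs can also be built directly instead of being scraped from the paginator. A sketch of that variant, as a drop-in replacement for `all_page` above:

```python
def all_page():
    # Build the ten page URLs directly: start=0, 25, ..., 225.
    return ['https://book.douban.com/top250?start=%d' % (i * 25)
            for i in range(10)]
```

Opening the output with `with open(filename, 'w', encoding='utf-8') as f:` would also guarantee the file is closed even if parsing fails midway.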
Source: http://www.bubuko.com/infodetail-3125631.html