解析库的安装
pip3 install beautifulsoup4
初始化 BeautifulSoup(str,"解析库")
- from bs4 import BeautifulSoup
- html='''<div class="panel"> <div class="panel-heading"> <h4>Hello</h4> </div> <div class="panel-body"> <ul class="list" id="list-1"> <li class="element">Foo</li> <li class="element">Bar</li> <li class="element">Jay</li> </ul> <ul class="list list-small" id="list-2"> <li class="element">Foo</li> <li class="element">Bar</li> </ul> </div></div>'''
- soup = BeautifulSoup(html,"lxml") # soup = BeautifulSoup(html,"html.parser")
- 标签选择器
- 选择元素 soup.E
- html = """<html><head><title>The Dormouse's story</title></head>
- <body>
- <p class="title" name="dromouse"><b>The Dormouse's story</b></p>
- <p class="story">Once upon a time there were three little sisters; and their names were
- <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
- <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
- <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
- and they lived at the bottom of a well.</p>
- <p class="story">...</p>
- """
- from bs4 import BeautifulSoup
- soup = BeautifulSoup(html, 'lxml')
- print(soup.title)
- print(type(soup.title))
- print(soup.head)
- print(soup.p)
- """
- 打印结果:
- <title>The Dormouse's story</title>
- <class 'bs4.element.Tag'>
- <head><title>The Dormouse's story</title></head>
- <p class="title" name="dromouse"><b>The Dormouse's story</b></p>
- """
- 选择元素
- 获取名称 soup.E.name
- html = """<html><head><title>The Dormouse's story</title></head>
- <body>
- <p class="title" name="dromouse"><b>The Dormouse's story</b></p>
- <p class="story">Once upon a time there were three little sisters; and their names were
- <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
- <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
- <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
- and they lived at the bottom of a well.</p>
- <p class="story">...</p>
- """
- from bs4 import BeautifulSoup
- soup = BeautifulSoup(html, 'lxml')
- print(soup.title.name) # title
- 获取名称
- 获取属性 soup.E.attrs[ ] or soup.E[ ]
- html = """<html><head><title>The Dormouse's story</title></head>
- <body>
- <p class="title" name="dromouse"><b>The Dormouse's story</b></p>
- <p class="story">Once upon a time there were three little sisters; and their names were
- <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
- <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
- <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
- and they lived at the bottom of a well.</p>
- <p class="story">...</p>
- """
- from bs4 import BeautifulSoup
- soup = BeautifulSoup(html, 'lxml')
- print(soup.p.attrs['name'])
- print(soup.p['name'])
- """
- dromouse
- dromouse
- """
- 获取属性
- 获取内容 soup.E.string
- html = """<html><head><title>The Dormouse's story</title></head>
- <body>
- <p class="title" name="dromouse"><b>The Dormouse's story</b></p>
- <p class="story">Once upon a time there were three little sisters; and their names were
- <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
- <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
- <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
- and they lived at the bottom of a well.</p>
- <p class="story">...</p>
- """
- from bs4 import BeautifulSoup
- soup = BeautifulSoup(html, 'lxml')
- print(soup.p.string)
- """The Dormouse's story
- """
- 获取内容
- 嵌套选择 soup.E.E
- html = """<html><head><title>The Dormouse's story</title></head>
- <body>
- <p class="title" name="dromouse"><b>The Dormouse's story</b></p>
- <p class="story">Once upon a time there were three little sisters; and their names were
- <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
- <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
- <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
- and they lived at the bottom of a well.</p>
- <p class="story">...</p>
- """
- from bs4 import BeautifulSoup
- soup = BeautifulSoup(html, 'lxml')
- print(soup.head.title.string)
- """The Dormouse's story
- """
- View Code
- 子节点 soup.E.contents
- html = """
- <html>
- <head>
- <title>The Dormouse's story</title>
- </head>
- <body>
- <p class="story">
- Once upon a time there were three little sisters; and their names were
- <a href="http://example.com/elsie" class="sister" id="link1">
- <span>Elsie</span>
- </a>
- <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
- and
- <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
- and they lived at the bottom of a well.
- </p>
- <p class="story">...</p>
- """
- from bs4 import BeautifulSoup
- soup = BeautifulSoup(html, 'lxml')
- print(soup.p.contents)
- """ ['\n Once upon a time there were three little sisters; and their names were\n ', <a class="sister" href="http://example.com/elsie" id="link1">
- <span>Elsie</span>
- </a>, '\n', <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, '\n and\n', <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>, '\n and they lived at the bottom of a well.\n']
- """
- View Code
- 子节点 soup.E.children
- html = """
- <html>
- <head>
- <title>The Dormouse's story</title>
- </head>
- <body>
- <p class="story">
- Once upon a time there were three little sisters; and their names were
- <a href="http://example.com/elsie" class="sister" id="link1">
- <span>Elsie</span>
- </a>
- <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
- and
- <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
- and they lived at the bottom of a well.
- </p>
- <p class="story">...</p>
- """
- from bs4 import BeautifulSoup
- soup = BeautifulSoup(html, 'lxml')
- print(soup.p.children)
- for i, child in enumerate(soup.p.children):
- print(i, child)
- """
- <list_iterator object at 0x00B116D0>
- 0
- Once upon a time there were three little sisters; and their names were
- 1 <a class="sister" href="http://example.com/elsie" id="link1">
- <span>Elsie</span>
- </a>
- 2
- 3 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
- 4
- and
- 5 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
- 6
- and they lived at the bottom of a well.
- """
- View Code
- 子孙节点 soup.E.descendants (标签内的文本节点也属于子孙节点)
- html = """
- <html>
- <head>
- <title>The Dormouse's story</title>
- </head>
- <body>
- <p class="story">
- Once upon a time there were three little sisters; and their names were
- <a href="http://example.com/elsie" class="sister" id="link1">
- <span>Elsie</span>
- </a>
- <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
- and
- <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
- and they lived at the bottom of a well.
- </p>
- <p class="story">...</p>
- """
- from bs4 import BeautifulSoup
- soup = BeautifulSoup(html, 'lxml')
- print(soup.p.descendants)
- for i, child in enumerate(soup.p.descendants):
- print(i, child)
- """
- <generator object descendants at 0x03EBD420>
- 0
- Once upon a time there were three little sisters; and their names were
- 1 <a class="sister" href="http://example.com/elsie" id="link1">
- <span>Elsie</span>
- </a>
- 2
- 3 <span>Elsie</span>
- 4 Elsie
- 5
- 6
- 7 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
- 8 Lacie
- 9
- and
- 10 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
- 11 Tillie
- 12
- and they lived at the bottom of a well.
- """
- View Code
- 父节点 soup.E.parent
- html = """
- <html>
- <head>
- <title>The Dormouse's story</title>
- </head>
- <body>
- <p class="story">
- Once upon a time there were three little sisters; and their names were
- <a href="http://example.com/elsie" class="sister" id="link1">
- <span>Elsie</span>
- </a>
- <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
- and
- <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
- and they lived at the bottom of a well.
- </p>
- <p class="story">...</p>
- """
- from bs4 import BeautifulSoup
- soup = BeautifulSoup(html, 'lxml')
- print(soup.a.parent)
- View Code
- 祖先节点 soup.E.parents
- html = """
- <html>
- <head>
- <title>The Dormouse's story</title>
- </head>
- <body>
- <p class="story">
- Once upon a time there were three little sisters; and their names were
- <a href="http://example.com/elsie" class="sister" id="link1">
- <span>Elsie</span>
- </a>
- <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
- and
- <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
- and they lived at the bottom of a well.
- </p>
- <p class="story">...</p>
- """
- from bs4 import BeautifulSoup
- soup = BeautifulSoup(html, 'lxml')
- print(list(enumerate(soup.a.parents)))
- View Code
- 兄弟节点 soup.E.next_siblings soup.E.previous_siblings
- html = """
- <html>
- <head>
- <title>The Dormouse's story</title>
- </head>
- <body>
- <p class="story">
- Once upon a time there were three little sisters; and their names were
- <a href="http://example.com/elsie" class="sister" id="link1">
- <span>Elsie</span>
- </a>
- <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
- and
- <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
- and they lived at the bottom of a well.
- </p>
- <p class="story">...</p>
- """
- from bs4 import BeautifulSoup
- soup = BeautifulSoup(html, 'lxml')
- print(list(enumerate(soup.a.next_siblings)))
- print(list(enumerate(soup.a.previous_siblings)))
- View Code
- 标准选择器
- find_all( name , attrs , recursive , text , **kwargs )
- 可根据标签名, 属性, 内容查找文档
- 标签名获取 soup.find_all('name')
- html='''<div class="panel">
- <div class="panel-heading">
- <h4>Hello</h4>
- </div>
- <div class="panel-body">
- <ul class="list" id="list-1">
- <li class="element">Foo</li>
- <li class="element">Bar</li>
- <li class="element">Jay</li>
- </ul>
- <ul class="list list-small" id="list-2">
- <li class="element">Foo</li>
- <li class="element">Bar</li>
- </ul>
- </div>
- </div>
- '''
- from bs4 import BeautifulSoup
- soup = BeautifulSoup(html, 'lxml')
- print(soup.find_all('ul'))
- View Code
- 属性获取 soup.find_all(attrs={})
- html='''<div class="panel">
- <div class="panel-heading">
- <h4>Hello</h4>
- </div>
- <div class="panel-body">
- <ul class="list" id="list-1" name="elements">
- <li class="element">Foo</li>
- <li class="element">Bar</li>
- <li class="element">Jay</li>
- </ul>
- <ul class="list list-small" id="list-2">
- <li class="element">Foo</li>
- <li class="element">Bar</li>
- </ul>
- </div>
- </div>
- '''
- from bs4 import BeautifulSoup
- soup = BeautifulSoup(html, 'lxml')
- print(soup.find_all(attrs={'id': 'list-1'}))
- print(soup.find_all(attrs={'name': 'elements'}))
- View Code
- html='''<div class="panel">
- <div class="panel-heading">
- <h4>Hello</h4>
- </div>
- <div class="panel-body">
- <ul class="list" id="list-1">
- <li class="element">Foo</li>
- <li class="element">Bar</li>
- <li class="element">Jay</li>
- </ul>
- <ul class="list list-small" id="list-2">
- <li class="element">Foo</li>
- <li class="element">Bar</li>
- </ul>
- </div>
- </div>
- '''
- from bs4 import BeautifulSoup
- soup = BeautifulSoup(html, 'lxml')
- print(soup.find_all(id='list-1'))
- print(soup.find_all(class_='element'))
- 第二种
- 文本内容获取
- html='''<div class="panel">
- <div class="panel-heading">
- <h4>Hello</h4>
- </div>
- <div class="panel-body">
- <ul class="list" id="list-1">
- <li class="element">Foo</li>
- <li class="element">Bar</li>
- <li class="element">Jay</li>
- </ul>
- <ul class="list list-small" id="list-2">
- <li class="element">Foo</li>
- <li class="element">Bar</li>
- </ul>
- </div>
- </div>
- '''
- from bs4 import BeautifulSoup
- soup = BeautifulSoup(html, 'lxml')
- print(soup.find_all(text='Foo'))
- View Code
- find( name , attrs , recursive , text , **kwargs )
- find 返回第一个匹配的元素 (没有匹配时返回 None), find_all 返回所有匹配元素组成的列表
- html='''<div class="panel">
- <div class="panel-heading">
- <h4>Hello</h4>
- </div>
- <div class="panel-body">
- <ul class="list" id="list-1">
- <li class="element">Foo</li>
- <li class="element">Bar</li>
- <li class="element">Jay</li>
- </ul>
- <ul class="list list-small" id="list-2">
- <li class="element">Foo</li>
- <li class="element">Bar</li>
- </ul>
- </div>
- </div>
- '''
- from bs4 import BeautifulSoup
- soup = BeautifulSoup(html, 'lxml')
- print(soup.find('ul'))
- print(type(soup.find('ul')))
- print(soup.find('page'))
- find_parents() find_parent()
- find_parents() 返回所有祖先节点, find_parent() 返回直接父节点.
- find_next_siblings() find_next_sibling()
- find_next_siblings() 返回后面所有兄弟节点, find_next_sibling() 返回后面第一个兄弟节点.
- find_previous_siblings() find_previous_sibling()
- find_previous_siblings() 返回前面所有兄弟节点, find_previous_sibling() 返回前面第一个兄弟节点.
- find_all_next() find_next()
- find_all_next() 返回节点后所有符合条件的节点, find_next() 返回节点后第一个符合条件的节点
- find_all_previous() 和 find_previous()
- find_all_previous() 返回节点前所有符合条件的节点, find_previous() 返回节点前第一个符合条件的节点
- CSS 选择器
- 通过 select() 直接传入 CSS 选择器即可完成选择 soup.select(css 选择器)
- html='''<div class="panel">
- <div class="panel-heading">
- <h4>Hello</h4>
- </div>
- <div class="panel-body">
- <ul class="list" id="list-1">
- <li class="element">Foo</li>
- <li class="element">Bar</li>
- <li class="element">Jay</li>
- </ul>
- <ul class="list list-small" id="list-2">
- <li class="element">Foo</li>
- <li class="element">Bar</li>
- </ul>
- </div>
- </div>
- '''
- from bs4 import BeautifulSoup
- soup = BeautifulSoup(html, 'lxml')
- print(soup.select('.panel .panel-heading'))
- print(soup.select('ul li'))
- print(soup.select('#list-2 .element'))
- print(type(soup.select('ul')[0]))
- View Code
- html='''<div class="panel">
- <div class="panel-heading">
- <h4>Hello</h4>
- </div>
- <div class="panel-body">
- <ul class="list" id="list-1">
- <li class="element">Foo</li>
- <li class="element">Bar</li>
- <li class="element">Jay</li>
- </ul>
- <ul class="list list-small" id="list-2">
- <li class="element">Foo</li>
- <li class="element">Bar</li>
- </ul>
- </div>
- </div>
- '''
- from bs4 import BeautifulSoup
- soup = BeautifulSoup(html, 'lxml')
- for ul in soup.select('ul'):
- print(ul.select('li'))
- View Code
- 获取属性 E.attrs[] or E[]
- html='''<div class="panel">
- <div class="panel-heading">
- <h4>Hello</h4>
- </div>
- <div class="panel-body">
- <ul class="list" id="list-1">
- <li class="element">Foo</li>
- <li class="element">Bar</li>
- <li class="element">Jay</li>
- </ul>
- <ul class="list list-small" id="list-2">
- <li class="element">Foo</li>
- <li class="element">Bar</li>
- </ul>
- </div>
- </div>
- '''
- from bs4 import BeautifulSoup
- soup = BeautifulSoup(html, 'lxml')
- for ul in soup.select('ul'):
- print(ul['id'])
- print(ul.attrs['id'])
- View Code
- 获取内容 get_text()
- html='''<div class="panel">
- <div class="panel-heading">
- <h4>Hello</h4>
- </div>
- <div class="panel-body">
- <ul class="list" id="list-1">
- <li class="element">Foo</li>
- <li class="element">Bar</li>
- <li class="element">Jay</li>
- </ul>
- <ul class="list list-small" id="list-2">
- <li class="element">Foo</li>
- <li class="element">Bar</li>
- </ul>
- </div>
- </div>
- '''
- from bs4 import BeautifulSoup
- soup = BeautifulSoup(html, 'lxml')
- for li in soup.select('li'):
- print(li.get_text())
- View Code
总结
推荐使用 lxml 解析库, 必要时使用 html.parser
标签选择器筛选功能弱, 但是速度快
建议使用 find(),find_all() 查询匹配单个结果或者多个结果
如果对 CSS 选择器熟悉建议使用 select()
记住常用的获取属性和文本值的方法
本文代码皆来自崔庆才《Python3 网络爬虫开发实战》
来源: http://www.bubuko.com/infodetail-2744121.html