Python 之 BeautifulSoup 标签查找与信息提取

一、标签查找

>>> for x in soup.find_all('a'):
    print(x)
    
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

>>> for x in soup.find_all('a',href = re.compile('lacie')):
    print(x)
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>

>>> for x in soup.find_all('a',string = re.compile('Elsie')):
    print(x)
    
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

>>> for x in soup.find('body').children:
    if isinstance(x,bs4.element.Tag):        #使用isinstance过滤掉空行内容
        print(x)
        
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

二、信息提取（链接提取）

>>> linklist = []
>>> for x in soup.find_all('a'):
    link = x.get('href')
    if link:
        linklist.append(link)
       
>>> for x in linklist:        #验证：循环打印出linklist列表中的链接
    print(x)
  
http://example.com/elsie
http://example.com/lacie
http://example.com/tillie

>>> linklst = []
>>> for x in soup.find_all('a', href = re.compile('elsie')):
    link = x.get('href')
    if link:
        linklst.append(link)
    
>>> for x in linklst:        #验证：循环打印出linklst列表中的链接
    print(x)
    
http://example.com/elsie

>>> for x in soup.find_all('a'):
    string = x.get_text()
    print(string)
   
Elsie
Lacie
Tillie

来源: http://www.bubuko.com/infodetail-2005536.html

与本文相关文章

暂无,快来抢沙发吧！