python爬虫

from html.parser import HTMLParser
from urllib import parse
from urllib.request import urlopen


class LinkParser(HTMLParser):
    """HTML parser that collects the absolute URL of every link on a page.

    Attributes:
        links: Absolute URLs found in ``<a href="...">`` tags so far.
        baseUrl: URL that relative links are resolved against.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Initialised here so that feed() can be used directly, without
        # going through getLinks() first.
        self.links = []
        self.baseUrl = ""

    def handle_starttag(self, tag, attrs):
        """Record the target of every ``<a href="...">`` start tag.

        HTMLParser calls this hook for each opening tag it meets. Links
        normally look like ``<a href="www.someurl.com"></a>``.

        Args:
            tag: Lower-cased tag name.
            attrs: List of ``(name, value)`` attribute pairs.
        """
        if tag != 'a':
            return
        for key, value in attrs:
            # A valueless attribute (<a href>) is reported as None; joining
            # it would only yield the base URL again, so skip it.
            if key == 'href' and value is not None:
                # Combine a relative URL such as "somepage.html" with the
                # base URL to get an absolute one, e.g.
                # www.netinstructions.com/somepage.html
                self.links.append(parse.urljoin(self.baseUrl, value))

    def getLinks(self, url):
        """Download *url* and return its text plus the links it contains.

        Args:
            url: Address of the page to fetch; also used as the base for
                resolving relative links.

        Returns:
            A ``(html, links)`` tuple. ``html`` is the decoded page text and
            ``links`` the list of absolute URLs found in it. Anything that
            is not served as ``text/html`` (JavaScript, CSS, PDFs, ...)
            yields ``("", [])``.

        Raises:
            urllib.error.URLError: If the page cannot be retrieved.
        """
        self.reset()  # drop parser state left over from a previous page
        self.links = []
        # Remember the base URL, which is needed to build absolute URLs.
        self.baseUrl = url
        with urlopen(url) as response:
            # get_content_type() ignores parameters, so a header such as
            # "text/html; charset=utf-8" is still recognised as HTML.
            if response.headers.get_content_type() != 'text/html':
                return "", []
            charset = response.headers.get_content_charset() or "utf-8"
            htmlBytes = response.read()
        # feed() handles str, not bytes (a change from Python 2.x to 3.x).
        try:
            htmlString = htmlBytes.decode(charset, errors="replace")
        except LookupError:
            # The server announced an encoding Python does not know.
            htmlString = htmlBytes.decode("utf-8", errors="replace")
        self.feed(htmlString)
        self.close()  # flush anything still buffered by the parser
        return htmlString, self.links
# And finally here is our spider. It takes in a URL, a word to find,
# and the number of pages to search through before giving up.
def spider(url, word, maxPages):
    """Crawl breadth-first from *url* until *word* is found.

    Each fetched page is searched for *word*; the links found on it are
    queued so the crawl can continue from there. Progress and the final
    outcome are printed to standard output.

    Args:
        url: Page at which the crawl starts.
        word: Word or string to look for in each page's text.
        maxPages: Maximum number of pages to fetch before giving up.

    Returns:
        None.
    """
    from collections import deque

    pagesToVisit = deque([url])
    visited = set()
    numberVisited = 0
    foundWord = False
    # The main loop: create a LinkParser, get all the links on the page
    # and search the page for the word.
    while numberVisited < maxPages and pagesToVisit and not foundWord:
        # Start from the beginning of our collection of pages to visit.
        candidate = pagesToVisit.popleft()
        if candidate in visited:
            # Already fetched; do not spend the page budget on it again.
            continue
        url = candidate
        visited.add(url)
        numberVisited += 1
        try:
            print(numberVisited, "Visiting:", url)
            parser = LinkParser()
            # getLinks returns the page text (to search for the word) and
            # the links on that page (to decide where to go next).
            data, links = parser.getLinks(url)
        except Exception:
            # A page that cannot be fetched or parsed must not stop the
            # crawl, but Ctrl-C and SystemExit still propagate.
            print(" **Failed!**")
            continue
        if word in data:
            foundWord = True
        # Add the links found on this page to the end of our collection
        # of pages to visit.
        pagesToVisit.extend(links)
        print(" **Success!**")
    if foundWord:
        print("The word", word, "was found at", url)
    else:
        print("Word never found")
来源: http://www.bubuko.com/infodetail-1949127.html
与本文相关文章

暂无,快来抢沙发吧！