一个抓取京东手机列表 URL 的小程序

 
#-*- coding: UTF-8 -*-
import urllib2
import bs4
import time
 
def getPage(urlStr):
    """Fetch ``urlStr`` and return the raw response body.

    Args:
        urlStr: The URL to request.

    Returns:
        Whatever ``read()`` yields for the opened response (the page content).

    Raises:
        Any error raised by ``urllib2.urlopen`` or ``read()`` propagates to
        the caller unchanged.
    """
    response = urllib2.urlopen(urlStr)
    try:
        return response.read()
    finally:
        # Release the connection even when read() fails; the crawl calls
        # this once per page and would otherwise leak a handle per request.
        response.close()
 
def getNextPageUrl(currPageNum):
    """Return the listing URL for the page after ``currPageNum``, or ''.

    The URL for page ``currPageNum + 1`` is built from the fixed listing
    template and fetched immediately. If the fetched page contains no
    ``<span class="next-disabled">`` element the URL is returned; otherwise
    an empty string is returned.

    NOTE(review): a page that carries the ``next-disabled`` span is reported
    as '' and is therefore never handed to the caller -- presumably that is
    the final listing page; confirm whether skipping it is intended.
    """
    # Template: http://list.jd.com/9987-653-655-0-0-0-0-0-0-0-1-1-<page>-1-1-72-4137-33.html
    nextPage = str(currPageNum + 1)
    url = u'http://list.jd.com/9987-653-655-0-0-0-0-0-0-0-1-1-' + nextPage + '-1-1-72-4137-33.html'

    # Probe the candidate page for the disabled "next" marker.
    markup = getPage(url)
    disabledMarkers = bs4.BeautifulSoup(markup).findAll('span', {'class': 'next-disabled'})
    if disabledMarkers:
        return ''
    return url
     
def analyzeList():
    """Walk the listing pages and collect the product links found on them.

    Starting with the URL returned by ``getNextPageUrl(0)``, each page is
    fetched and every ``<div class="p-name">`` on it is inspected; the
    ``href`` of the first ``<a>`` inside each such div is collected. The
    page counter is printed after each page as progress output. The loop
    ends when ``getNextPageUrl`` returns ''.

    Returns:
        A list of the collected ``href`` values, in the order encountered.
    """
    pageNum = 0
    hrefs = []
    url = getNextPageUrl(pageNum)
    while url != '':
        soup = bs4.BeautifulSoup(getPage(url))
        for nameDiv in soup.findAll('div', {'class': 'p-name'}):
            # Search the tag directly; serialising it and parsing the
            # markup a second time yields the same first anchor.
            anchor = nameDiv.find('a')
            href = anchor.get('href') if anchor is not None else None
            # A div without a link (or a link without href) is skipped
            # instead of aborting the whole crawl with TypeError/KeyError.
            if href is not None:
                hrefs.append(href)

        pageNum += 1
        print(pageNum)  # progress: number of pages processed so far
        url = getNextPageUrl(pageNum)
    return hrefs
 
def analyzeContent(url):
    """Placeholder for analysing a single page.

    ``url`` is accepted but not used yet; an empty string is always returned.
    """
    emptyResult = ''
    return emptyResult
 
def writeToFile(list, path):
    """Append every entry of ``list`` to the file at ``path``, one per line.

    Args:
        list: Iterable of strings to write. The name shadows the builtin
            but is kept so keyword callers continue to work.
        path: Destination file. It is opened in append mode, so existing
            content is kept and new entries are added after it.
    """
    # ``with`` closes the handle even if one of the writes raises.
    with open(path, 'a') as f:
        # Terminate each entry with a real line break; the previous literal
        # backslash + 'n' glued all entries together on a single line.
        f.writelines(elem + '\n' for elem in list)
 
if __name__ == '__main__':
    # Crawl the listing pages, report how many links were collected, then
    # append them to the output file.
    phoneUrls = analyzeList()  # distinct name: do not rebind builtin ``list``
    # Real newline escape so the summary is followed by a blank line rather
    # than the two literal characters backslash + 'n'.
    print('共抓取' + str(len(phoneUrls)) + '条\n')

    writeToFile(phoneUrls, u'E:\\\\jd_phone_list.dat')
#该片段来自于http://www.codesnippet.cn/detail/101220137914.html

来源: http://www.codesnippet.cn/detail/101220137914.html

与本文相关文章

暂无,快来抢沙发吧！