抓取三大电商今日特价

 
# -*- coding: gb2312 -*-
import urllib2
import re
import json
from bs4 import BeautifulSoup
import sqlite3
from time import gmtime, strftime, localtime
 
 
def getPrice(link):   
    f3 = urllib2.urlopen(link)
    html3 = f3.read()
    try:
        html3.decode('gb2312')
    except:
        pass
    soup_hot = BeautifulSoup(html3, from_encoding='gb2312')
    tag_hot = soup_hot("p", "promoText")
     
    for tag in tag_hot:
        detail_link = tag.a['href']
        print detail_link
        f4 = urllib2.urlopen(detail_link)
   
        html4 = f4.read()
##        try:
##            html4.decode('gb2312')
##        except:
##            pass
        soup_detail = BeautifulSoup(html4, from_encoding='gb2312')
        for tag in soup_detail("p","promoText"):
            print tag.text
        for tag2 in soup_detail("h1"):
            print tag2.text
         
        f5 = urllib2.urlopen(detail_link)
        pattern_price = re.compile("pvalues(.*)")
        for line in f5.readlines():
            m = re.search('pvalues:(.*)', line)
            if m is not None:
                return m.group(1)
            else:
                continue
             
         
         
 
 
if __name__ == '__main__':
##    proxyhandler = urllib2.ProxyHandler({'http':'http://proxy.xxxx.com'})
##    opener = urllib2.build_opener(proxyhandler)
##    urllib2.install_opener(opener)
 
    conn = sqlite3.connect("items.db")
    cur = conn.cursor()
    current_time_x = localtime()
    current_date = strftime("%Y/%m/%d", current_time_x)
    current_time = strftime("%H:%M", current_time_x)
     
     
    print 'jd:', '*'*60
    request = urllib2.Request('http://www.jd.com')
    request.add_header('User-agent', 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)')
    f = urllib2.urlopen(request)
    html = f.read()
    try:
        html.decode('gb18030')
    except:
        pass
     
    soup = BeautifulSoup(html,from_encoding='gb18030')
    myContentList = soup("li", onclick=re.compile('fengkuang2012'))
    print myContentList
    print "\\n"*5
     
    for tag in myContentList:
        print tag
        print "\\n"*2
 
        print '物品:',tag.contents[3].contents[0].text
        print '价格:',tag.contents[5].contents[1].text
        print '链接:',tag.contents[1].a['href']
        print '图片:',tag.contents[1].img['src']
        print '*'*80
 
 
    print "\\n\\n", 'newegg_cn:',  '*' * 60
    f2 = urllib2.urlopen('http://zhadan.newegg.com.cn')
    html2 = f2.read()
    try:
        html2.decode('gb2312')
    except:
        pass
    soup_newegg = BeautifulSoup(html2, from_encoding='gb2312')
 
    tag_bomb = soup_newegg("div", 'inner')
    for tag in tag_bomb:
        lis = tag.find_all('li')
        for li in lis:
            if li['class'][-1] != u'locked':
                print getPrice(li.a['href'])
                print "Img Link: ", li.img['src']
            else:
                print "Locked item: ", li.a['href']
                print "Img Link: ", li.img['src']
 
 
 
 
    print "\\n\\n", '51buy:', '>'*60
    f_51buy = urllib2.urlopen('http://www.51buy.com')
    html_51buy = f_51buy.read()
    soup_51buy = BeautifulSoup(html_51buy, from_encoding='utf8')
    tag_quickbuy = soup_51buy("div", 'bd_inner')
 
    for tag in tag_quickbuy:
        mylist = tag.contents
        print mylist
        print "\\n"*5
 
# Today's hot list:
    print "Today's hot list, come on baby:\\n\\n"
    lis = mylist[1].find_all('li')
    for li in lis:
        print li.a['href']
        print li.a['title']
        print li.a.text
        print li.find("div", 'price').text
        print li.img['_src']
        print '\\n'
    print '\\n'*5
 
    print "Tomorrow hot list to be expected:\\n\\n"
 
     
# Not yet started -- For tomorrow's hot list, no price information
    lis_tomorrow = mylist[3].find_all('li')
    for li_t in lis_tomorrow:
        print li_t.a['href']
        print li_t.a['title']
        print li_t.a.text
        print li_t.find("div", 'wait').text
        print li_t.img['_src']
        print '\\n'
#该片段来自于http://www.codesnippet.cn/detail/081020136276.html
来源: http://www.codesnippet.cn/detail/081020136276.html
与本文相关文章

暂无，快来抢沙发吧！