# -*- coding: gb2312 -*-
from __future__ import print_function

import contextlib
import json
import re
import sqlite3
import urllib2
from time import gmtime, strftime, localtime

from bs4 import BeautifulSoup
# Captures whatever follows the literal "pvalues:" up to the end of that line
# ("." never matches a newline, so the capture cannot spill onto the next line).
_PVALUES_RE = re.compile('pvalues:(.*)')


def _extract_pvalues(page_source):
    """Return the text after the first ``pvalues:`` marker, or None if absent."""
    found = _PVALUES_RE.search(page_source)
    if found is None:
        return None
    return found.group(1)


def getPrice(link):
    """Follow the promo links on a listing page and return the ``pvalues`` text.

    The page at *link* is downloaded and parsed as gb2312.  For every
    ``<p class="promoText">`` element that contains an anchor, the anchor's
    ``href`` is printed and downloaded; the detail page's ``promoText``
    paragraphs and ``<h1>`` headings are printed, and the raw page is searched
    for ``pvalues:``.

    NOTE(review): the original source had lost its indentation; the price
    search is assumed to run once per promo link (inside the loop) -- confirm.

    Args:
        link: URL of the listing page.

    Returns:
        The remainder of the first line containing ``pvalues:`` on the first
        detail page that has one (presumably the price payload -- verify
        against the site), or None when no detail page contains the marker.

    Raises:
        urllib2.URLError: if either download fails.
    """
    with contextlib.closing(urllib2.urlopen(link)) as response:
        listing_html = response.read()
    # BeautifulSoup does the decoding itself via from_encoding, so no manual
    # decode step is needed beforehand.
    listing = BeautifulSoup(listing_html, from_encoding='gb2312')

    for promo in listing("p", "promoText"):
        anchor = promo.a
        if anchor is None or not anchor.get('href'):
            # A promo paragraph without a link has no detail page to inspect.
            continue
        detail_link = anchor['href']
        print(detail_link)

        # Download the detail page once and reuse the bytes for both the
        # parsed view and the regex search.
        with contextlib.closing(urllib2.urlopen(detail_link)) as response:
            detail_html = response.read()
        detail = BeautifulSoup(detail_html, from_encoding='gb2312')

        for paragraph in detail("p", "promoText"):
            print(paragraph.text)
        for heading in detail("h1"):
            print(heading.text)

        pvalues = _extract_pvalues(detail_html)
        if pvalues is not None:
            return pvalues
    return None
def _fetch(target):
    """Return the raw response body for *target* (URL string or urllib2.Request)."""
    with contextlib.closing(urllib2.urlopen(target)) as response:
        return response.read()


def _show_jd_deals():
    """Print the ``<li>`` deals on the JD front page whose onclick mentions fengkuang2012."""
    print('jd:', '*' * 60)
    request = urllib2.Request('http://www.jd.com')
    request.add_header('User-agent', 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)')
    soup = BeautifulSoup(_fetch(request), from_encoding='gb18030')
    deals = soup("li", onclick=re.compile('fengkuang2012'))
    print(deals)
    print("\n" * 5)
    for deal in deals:
        print(deal)
        print("\n" * 2)
        # Labels are: item, price, link, image.  The positional .contents
        # indexes depend on the exact page markup.
        print('物品:', deal.contents[3].contents[0].text)
        print('价格:', deal.contents[5].contents[1].text)
        print('链接:', deal.contents[1].a['href'])
        print('图片:', deal.contents[1].img['src'])
        # NOTE(review): separator assumed to be printed once per deal.
        print('*' * 80)


def _show_newegg_deals():
    """Print price data and image link for each ``<li>`` in the Newegg ``div.inner`` blocks."""
    print("\n\nnewegg_cn:", '*' * 60)
    soup = BeautifulSoup(_fetch('http://zhadan.newegg.com.cn'), from_encoding='gb2312')
    for panel in soup("div", 'inner'):
        for item in panel.find_all('li'):
            # An <li> with no class attribute is treated as not locked
            # instead of raising KeyError.
            css_classes = item.get('class') or []
            if css_classes and css_classes[-1] == u'locked':
                print("Locked item: ", item.a['href'])
            else:
                print(getPrice(item.a['href']))
            print("Img Link: ", item.img['src'])


def _print_51buy_item(item, detail_class):
    """Print link, title, text, the ``div.<detail_class>`` text and image of one ``<li>``."""
    print(item.a['href'])
    print(item.a['title'])
    print(item.a.text)
    print(item.find("div", detail_class).text)
    print(item.img['_src'])
    print('\n')


def _show_51buy_deals():
    """Print today's and tomorrow's hot lists from each 51buy ``div.bd_inner`` block."""
    print("\n\n51buy:", '>' * 60)
    soup = BeautifulSoup(_fetch('http://www.51buy.com'), from_encoding='utf8')
    # NOTE(review): both lists are assumed to be read per bd_inner block
    # (inside the loop); the flattened source does not show the nesting.
    for panel in soup("div", 'bd_inner'):
        children = panel.contents
        print(children)
        print("\n" * 5)
        # Today's hot list: each entry carries a div.price.
        print("Today's hot list, come on baby:\n\n")
        for item in children[1].find_all('li'):
            _print_51buy_item(item, 'price')
        print('\n' * 5)
        # Tomorrow's list has not started yet, so entries carry a div.wait
        # instead of a price.
        print("Tomorrow hot list to be expected:\n\n")
        for item in children[3].find_all('li'):
            _print_51buy_item(item, 'wait')


def main():
    """Scrape and print the current deals from JD, Newegg CN and 51buy."""
    # To go through an HTTP proxy, install an opener before fetching:
    #   proxyhandler = urllib2.ProxyHandler({'http': 'http://proxy.xxxx.com'})
    #   opener = urllib2.build_opener(proxyhandler)
    #   urllib2.install_opener(opener)

    # items.db is still opened as before, but nothing is written to it yet;
    # closing() makes sure the connection is released.  The unused cursor and
    # timestamp locals were dropped.
    with contextlib.closing(sqlite3.connect("items.db")):
        _show_jd_deals()
        _show_newegg_deals()
        _show_51buy_deals()


if __name__ == '__main__':
    main()
# This snippet comes from http://www.codesnippet.cn/detail/081020136276.html
# Source: http://www.codesnippet.cn/detail/081020136276.html