#!/usr/bin/env python
# coding=utf-8
import contextlib
import re

try:  # Python 2
    import urllib2
    from urlparse import urljoin, urlparse
except ImportError:  # Python 3
    import urllib.request as urllib2
    from urllib.parse import urljoin, urlparse

from uliweb.orm import *
# Captures the href value of an anchor tag, whether it is double-quoted,
# single-quoted or unquoted.  Only the href attribute is captured, so other
# attributes that precede it (class=, id=, ...) are never mistaken for the link.
_ANCHOR_HREF_RE = re.compile(
    r"""<a\s(?:[^>]*?\s)?href\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+))""",
    re.IGNORECASE,
)


def geturl(url, timeout=5000):
    """Download the page at *url* and return the links of its anchor tags.

    Args:
        url: Address of the page to fetch; it must contain ``://``.
        timeout: Value handed to ``urlopen`` as its ``timeout`` argument.
            NOTE(review): ``urlopen`` interprets this in seconds, so the
            historical default of 5000 is probably far larger than intended;
            it is kept only for backward compatibility.

    Returns:
        A list of URL strings: first every link that already carries a scheme
        and a host, in document order, then every relative link resolved
        against *url*, in document order.  Links that are empty, start with
        ``#`` or contain no ``/`` are ignored, as are scheme-only links such
        as ``javascript:`` or ``mailto:``.  An empty list is returned when
        *url* is empty or has no ``://``.

    Raises:
        Whatever ``urlopen`` raises when the page cannot be fetched.
    """
    if not url or "://" not in url:
        return []

    page_url = url
    if not isinstance(page_url, str):
        # Python 2 ``unicode`` value: hand urllib2 a UTF-8 byte string.
        page_url = page_url.encode("utf-8")

    # closing() guarantees the connection is released even if read() fails.
    with contextlib.closing(urllib2.urlopen(page_url, timeout=timeout)) as response:
        html = response.read()
    if not isinstance(html, str):
        # Python 3 returns bytes; the regular expression below needs text.
        html = html.decode("utf-8", "replace")

    absolute_links = []
    resolved_links = []
    for match in _ANCHOR_HREF_RE.finditer(html):
        href = (match.group(1) or match.group(2) or match.group(3) or "").strip()
        if not href or href.startswith("#") or "/" not in href:
            continue
        try:
            parts = urlparse(href)
            if parts.scheme:
                # Keep real network URLs; drop javascript:, mailto: and the like.
                if parts.netloc:
                    absolute_links.append(href)
                continue
            # urljoin handles "/path", "dir/page" and "//host/path" alike and
            # never produces a doubled slash after the host.
            resolved_links.append(urljoin(page_url, href))
        except ValueError:
            # Malformed link (e.g. a broken IPv6 literal): skip it.
            continue
    return absolute_links + resolved_links
# Module-level ORM connection, opened as soon as this file is imported or run.
# NOTE(review): the MySQL root credentials are hard-coded in the connection
# string; consider loading them from configuration instead.
db = get_connection('mysql://root:root@localhost/spider?charset=utf8')
class urls(Model):
    """ORM model holding one row per URL known to the crawler."""

    # Address of the page.
    url = Field(str)
    # State flag: rows are inserted with "0" (see insert_url) and switched
    # to "1" by update_url.
    status = Field(str)
def search_url(url):
    """Return the result of ``urls.get`` for the row whose ``url`` column equals *url*."""
    return urls.get(urls.c.url == url)
def insert_url(url):
    """Store *url* with status "0" unless ``search_url`` already finds it.

    Nothing is written, and nothing is reported, when a row for the same
    URL exists.
    """
    if search_url(url):
        return
    record = urls()
    record.url = url
    record.status = "0"
    record.save()
def get_url():
    """Return the result of ``urls.get`` for a row whose status is "0"."""
    return urls.get(urls.c.status == "0")
def update_url(n):
    """Set the status of the row with the same id as *n* to "1".

    The row is re-read by id before it is changed.  If it can no longer be
    found the call does nothing instead of failing on a ``None`` result.
    """
    row = urls.get(urls.c.id == n.id)
    if row is None:
        return
    row.update(status="1")
    row.save()
def save_newurl(url):
    """Pass every entry of the iterable *url* to ``insert_url`` and report it.

    One "add ... OK!" line is printed per entry, including entries that
    ``insert_url`` skipped because they were already stored.  ``None`` is
    treated like an empty iterable.
    """
    for link in url or ():
        insert_url(link)
        # Parenthesised single-argument form prints the same text on
        # Python 2 and Python 3.
        print("add %s OK!" % (link,))
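# Disabled bootstrap code: it drops and recreates all tables and stores a
# first URL with status "0".  Presumably meant to be enabled once by hand to
# initialise an empty database.
#db.metadata.drop_all()
#db.metadata.create_all()
#n = urls()
#n.url = "http://v.hpcasts.com/"
#n.status = "0"
#n.save()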
- while 1:
- new = get_url()
- try:
- url = geturl(new.url)
# This snippet comes from http://www.codesnippet.cn/detail/260720134841.html
# Source: http://www.codesnippet.cn/detail/260720134841.html