多线程 URL 采集

 
#coding=gb2312
import re
import urllib
import time
import threading
#Thread lock: serialises updates to the shared `count` and `url` globals
#made by the worker threads.
lock = threading.Lock()
#Running total of matched records (duplicates included).
count =0
#Collected URL/host strings, appended to by every worker thread.
url=[]
 
#First-pass regex: matches from a '<span class="g">' marker through a
#following date written as YYYY-M-D or YYYY-MM-DD, case-insensitively.
#'.*' is greedy and does not cross newlines (no DOTALL flag), so one match
#runs to the last such date on the line.
rc = re.compile(r'(<span class="g">)(.*)[0-9]{4}-[0-9]{1,2}-[0-9]{1,2}',re.I)
 
#Second-pass regex; not referenced anywhere in the visible code.
#NOTE(review): inside a raw string '\\s' matches a literal backslash followed
#by 's', not whitespace -- possibly an escaping artifact; confirm before use.
rcr = re.compile(r'^>(.*)(/ )?(\\s)?',re.I)
#Replace duplicate search results.
#NOTE(review): no code follows this comment; de-duplication is done with
#set() in the __main__ section.
#Multi-threaded search
class SeoRearch(threading.Thread):
    """Worker thread that scrapes one Baidu result page.

    Each instance downloads a single page of results for keyword ``wd``
    starting at result offset ``n``. For every ``rc`` match it takes the
    text that follows the ``<span class="g">`` marker up to the first
    ``/``, prints it, appends it to the module-level ``url`` list and
    increments the module-level ``count`` (both under ``lock``).
    """

    #Length of the '<span class="g">' marker that starts every rc match.
    _PREFIX_LEN = len('<span class="g">')

    def __init__(self,wd='dedecms',n=0):
        """Remember the search keyword ``wd`` and the result offset ``n``."""
        super(SeoRearch,self).__init__()
        self.wd = wd
        self.n = n

    @staticmethod
    def _extract_host(fragment):
        """Return the text between the span marker and the first '/'.

        ``<b>``/``</b>`` highlight tags are removed first. When the
        fragment contains no '/', the whole remainder is returned; the
        previous ``temp[16:temp.find('/')]`` slice silently dropped the
        last character in that case because ``find`` returned -1.
        """
        cleaned = fragment.replace('<b>','').replace('</b>','')
        return cleaned[SeoRearch._PREFIX_LEN:].partition('/')[0]

    def run(self):
        """Fetch the result page and record every extracted entry."""
        global count
        #Percent-encode the keyword so spaces, '&', '#' or non-ASCII bytes
        #cannot corrupt the query string.
        target = ("http://www.baidu.com/s?rn=50&pn="+str(self.n)
                  +"&q1="+urllib.quote_plus(self.wd))
        response = urllib.urlopen(target)
        try:
            page = response.read()
        finally:
            #Always release the connection, even if read() fails.
            response.close()
        #No per-result sleep: the page is already downloaded at this point,
        #so pausing between matches only slowed the thread down.
        for match in rc.finditer(page):
            host = self._extract_host(match.group(0))
            print(host)
            #'with' guarantees the lock is released if an exception occurs.
            with lock:
                url.append(host)
                count += 1
if __name__== '__main__':
    #Script entry point: prompt for a keyword and a page count, run one
    #SeoRearch thread per result page (50 results each), then write the
    #de-duplicated entries to record.txt and print a summary.
    s = raw_input("请输入你要查询的关键字")
    Page = raw_input("请输入要查询的页数每页50条")
    threads=[]
    for page_index in range(int(Page)):
        worker = SeoRearch(wd=s,n=page_index*50)
        worker.start()
        threads.append(worker)

    #Wait for every worker; after join() returns, `url` and `count` are final.
    for worker in threads:
        worker.join()

    #Open the output only once there is something to write, and let 'with'
    #close it even if a write fails. sorted() gives a stable order; plain
    #set iteration order is arbitrary.
    with open('record.txt','w') as file1:
        for entry in sorted(set(url)):
            #One record per line ("\\n" used to write a literal backslash-n).
            file1.write(entry+"\n")
    #`count` is the number of matches seen, duplicates included.
    print("你搜索的关键字是:%s,搜索的页数是:%s,总计一共 %s 条信息" % (s,Page,count))
#该片段来自于http://www.codesnippet.cn/detail/250220148782.html

来源: http://www.codesnippet.cn/detail/250220148782.html

与本文相关文章

暂无,快来抢沙发吧！