基于Python的实时爬虫：每小时抓取PM2.5等污染物数据

 
# coding:utf-8
import threading
import urllib
import re,sys
import time
import hashlib
import os
  
  
# NOTE(review): this statement only binds the string 'utf-8' to an attribute
# named ``setdefaultencoding`` on the ``sys`` module. Nothing is called, so
# the interpreter's default string encoding is not changed by this line.
sys.setdefaultencoding = 'utf-8'
  
  
# Captures the contents of every <td> cell on a city page.
_CELL_RE = re.compile(r'<td>(.*?)</td>', re.S)
# Matches timestamps such as 2016-04-13 20:10:00.
_TIMESTAMP_RE = re.compile(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}')


def _group_rows(cells, timestamp, width=10):
    """Split *cells* into rows of *width* cells, each followed by *timestamp*.

    A trailing group with fewer than *width* cells is dropped.
    """
    rows = []
    for start in range(0, len(cells) - width + 1, width):
        rows.append(cells[start:start + width] + [timestamp])
    return rows


def fetchdata(city, data_dir='D:/Python33/py27/data/', interval=3600):
    """Poll the www.pm25.in page for *city* forever and append new readings.

    Each pass downloads ``http://www.pm25.in/<city>``, extracts every ``<td>``
    cell and the first timestamp found in the page. When that timestamp
    differs from the one seen on the previous pass, the cells are grouped ten
    per record, the timestamp is appended to each record, and the records are
    appended as comma-separated lines to ``<data_dir>/<city>.txt``.

    Args:
        city: Path segment appended to the site URL; also the output file's
            base name.
        data_dir: Directory that receives ``<city>.txt``.
        interval: Seconds to sleep between passes.

    This function never returns. A failed download or a page without a
    timestamp is reported/skipped and retried after *interval* seconds instead
    of terminating the calling thread.
    """
    print(city)
    last_timestamp = None
    while True:
        # Data source is www.pm25.in. Scraping works as long as the site has
        # not blocked this IP; if it has, apply for their API instead.
        try:
            response = urllib.urlopen('http://www.pm25.in/' + city)
            try:
                text = response.read()
            finally:
                response.close()
        except IOError as err:
            # A transient network failure must not end this city's polling.
            print('%s: fetch failed (%s)' % (city, err))
            time.sleep(interval)
            continue

        cells = _CELL_RE.findall(text)  # PM2.5 and other pollutant values
        timestamps = _TIMESTAMP_RE.findall(text)

        # No timestamp (unexpected page) or nothing new since the last pass:
        # wait for the next hourly update.
        if not timestamps or timestamps[0] == last_timestamp:
            time.sleep(interval)
            continue
        last_timestamp = timestamps[0]

        # Append to a file named after the city; ``with`` closes it even if a
        # write fails.
        with open(os.path.join(data_dir, city + '.txt'), 'a') as out:
            for row in _group_rows(cells, last_timestamp):
                out.write(','.join(row) + '\n')

        print(city)
        print(last_timestamp)
        # Show the local wall-clock time at which this pass finished.
        print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
        time.sleep(interval)
  
if __name__ == "__main__":
    # Read the city list: one city name per line. Blank lines are skipped so
    # that no thread is started for an empty city name.
    with open('D:/Python33/py27/data/city/cities.txt', 'r') as city_file:
        cities = [line.strip() for line in city_file if line.strip()]

    # One polling thread per city.
    threads = [threading.Thread(target=fetchdata, args=(city,))
               for city in cities]

    for thread in threads:
        thread.start()
    # Keep the main thread waiting on the workers.
    for thread in threads:
        thread.join()
#该片段来自于http://www.codesnippet.cn/detail/2111201615148.html

来源: http://www.codesnippet.cn/detail/2111201615148.html

与本文相关文章

暂无,快来抢沙发吧！