import time

import pymysql
from scrapy.exceptions import DropItem
class ErshouchePipeline(object):
    """Scrapy pipeline that persists used-car items into MySQL.

    On startup it loads the URLs already stored in the `二手车之家` table and
    drops any item whose URL has been seen — either in a previous run or
    earlier in the current crawl.
    """

    def __init__(self):
        # NOTE(review): credentials are hard-coded; consider moving them to
        # Scrapy settings / environment variables.
        self.conn = pymysql.connect(
            host='127.0.0.1',
            port=3306,
            user='root',
            passwd='mlpythonlmoi',
            db='ershouche',
            charset='utf8',
        )
        self.cursor = self.conn.cursor(cursor=pymysql.cursors.DictCursor)
        # Preload the URLs that are already stored so previously scraped
        # listings are skipped.
        self.cursor.execute("select 路由网址 from 二手车之家")
        rows = self.cursor.fetchall()  # all records from the query
        print('返回查询得到的记录:', rows)
        urls = [row['路由网址'] for row in rows]
        print('存在的:', urls)
        # Keep seen URLs in a set for O(1) membership tests. Unlike the
        # original list, this set is also updated after every insert, so
        # duplicates appearing within a single crawl are rejected too.
        self.seen_urls = set(urls)

    def process_item(self, item, spider):
        """Insert *item* into MySQL unless its URL is already known.

        Raises DropItem for duplicates; returns the item otherwise.
        """
        if item['car_url'] in self.seen_urls:
            raise DropItem('该 item 数据库中已经存在!')
        sql = "insert into 二手车之家 values(Null,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        values = (
            item['city'], item['trademark'], item['model'], item['colour'],
            item['price'], item['purpose'], item['vehicle_condition'],
            item['drive_mode'], item['Truck_kilometer'], item['car_license'],
            item['Stop_displacemen'], item['year_jian_due'],
            item['insurance_policy_matures'], item['assurance_due'],
            item['emission_standard'], item['guohu_number'],
            item['maintenance'], item['car_url'],
        )
        self.cursor.execute(sql, values)
        self.conn.commit()
        # Remember this URL so the same listing cannot be inserted twice
        # during the current crawl (bug in the original: list never updated).
        self.seen_urls.add(item['car_url'])
        return item

    def close_spider(self, spider):
        """Release the cursor and the MySQL connection when the spider ends."""
        self.cursor.close()
        self.conn.close()
        print("操作结束!")
        print('结束时间:' + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))
# Source: http://www.bubuko.com/infodetail-3002228.html