爬取 YY 评级信息

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @File  : 爬取 YY 评级基本信息.py
# @Author: lattesea
# @Date  : 2019/10/7
# @Desc  :
import csv
import json
import random
import time

import requests
from fake_useragent import UserAgent
class YYpingjiSpider:
    """Scrape issuer rating details from the ratingdog.cn API into CSV files.

    ``run`` walks the paginated search endpoint, fetches the detail record of
    every issuer it finds and appends one row per issuer to ``1001.csv``
    (issuers whose ``IssuerType`` is ``'产业'``) or ``1002.csv`` (issuers whose
    ``IssuerType`` is ``'城投'``).
    """

    # Records requested per search page; mirrors ``limit=10`` in ``self.url``.
    _PAGE_SIZE = 10
    # Exclusive upper bound of the search offsets visited by ``run``.
    _TOTAL_RECORDS = 4631
    # Seconds to wait for the API before giving up, so a stalled connection
    # cannot hang the crawl forever.
    _REQUEST_TIMEOUT = 15

    # Fields read from the top level of a detail record, in CSV column order.
    _COMMON_FIELDS = (
        'IssuerName', 'CorporateRating', 'RatingAgency', 'Holder', 'Industry',
        'Nature', 'YYRating', 'IssuerType', 'CreditAnalysis',
    )
    # Fields read from the nested ``CyExtendInfo`` object (IssuerType=1001).
    _EXTEND_FIELDS_1001 = (
        'YYIndustry', 'YYIndustryId', 'IndustrylStatus',
        'ShareholderBackground', 'OperatingStatus', 'FinancialStatus', 'Focus',
    )
    # Fields read from the nested ``CtExtendInfo`` object (IssuerType=1002).
    _EXTEND_FIELDS_1002 = (
        'PlatformImportance', 'PrincipalBusiness', 'GDP', 'Revenue',
        'YYRatio', 'IssuerCity', 'ADLevel',
    )

    def __init__(self):
        # Search endpoint; placeholders are (offset, filter) in that order.
        self.url = 'https://api.ratingdog.cn/v1/search?limit=10&offset={}&type=3&qtext=&filter={}&_=1570391570681'
        # Detail endpoints; the placeholder is the IssuerID.
        self.url2 = 'https://api.ratingdog.cn/v1/GetIssuerInfo?IssuerID={}&IssuerType=1001'
        self.url3 = 'https://api.ratingdog.cn/v1/GetIssuerInfo?IssuerID={}&IssuerType=1002'
        # Created lazily by get_headers() and then reused for every request.
        self._ua = None

    def get_headers(self):
        """Return request headers carrying a randomly chosen User-Agent."""
        ua = getattr(self, '_ua', None)
        if ua is None:
            # Build the UserAgent helper once instead of on every request.
            ua = self._ua = UserAgent()
        return {
            "Accept": "application/json, text/plain, */*",
            "Origin": "https://www.ratingdog.cn",
            "Referer": "https://www.ratingdog.cn/",
            "Sec-Fetch-Mode": "cors",
            "User-Agent": ua.random,
        }

    def _fetch_json(self, url):
        """GET *url* and return the decoded JSON body.

        Raises:
            requests.HTTPError: if the server answers with an error status.
            requests.Timeout: if no response arrives within the timeout.
        """
        response = requests.get(url=url, headers=self.get_headers(),
                                timeout=self._REQUEST_TIMEOUT)
        response.raise_for_status()
        return json.loads(response.text)

    @classmethod
    def _extract_basic_message(cls, rows, extend_key, extend_fields):
        """Flatten one detail record into a dict keyed by CSV column name.

        Args:
            rows: the ``rows`` object of a detail response (a dict).
            extend_key: name of the nested object holding the type-specific
                fields (``'CyExtendInfo'`` or ``'CtExtendInfo'``).
            extend_fields: names to read from that nested object.

        Returns:
            A dict with the common fields followed by *extend_fields*; fields
            missing from the response map to ``None`` (written as an empty CSV
            cell). An empty or missing *rows* yields ``{}``.
        """
        if not rows:
            return {}
        # A null/missing extension object is treated as "no extra data".
        extend = rows.get(extend_key) or {}
        message = {field: rows.get(field) for field in cls._COMMON_FIELDS}
        message.update((field, extend.get(field)) for field in extend_fields)
        return message

    @staticmethod
    def _append_row(path, fieldnames, result):
        """Append *result* as one CSV row to *path* in *fieldnames* order."""
        # Explicit UTF-8: the records contain Chinese text, which the
        # platform default encoding is not guaranteed to represent.
        with open(path, 'a', newline='', encoding='utf-8') as f:
            csv.DictWriter(f, fieldnames).writerow(result)

    def parse_IssuerID_IssuerType(self, url):
        """Fetch one search page and return its ``(IssuerID, IssuerType)`` pairs."""
        payload = self._fetch_json(url)
        IssuerID_list = [(row['IssuerID'], row['IssuerType'])
                         for row in payload.get('rows') or []]
        print(IssuerID_list)
        return IssuerID_list

    def parse_basic_message_1002(self, IssuerID):
        """Fetch the IssuerType=1002 detail record of *IssuerID* as a flat dict."""
        payload = self._fetch_json(self.url3.format(IssuerID))
        basic_message = self._extract_basic_message(
            payload.get('rows'), 'CtExtendInfo', self._EXTEND_FIELDS_1002)
        print(basic_message)
        return basic_message

    def parse_basic_message_1001(self, IssuerID):
        """Fetch the IssuerType=1001 detail record of *IssuerID* as a flat dict."""
        payload = self._fetch_json(self.url2.format(IssuerID))
        basic_message = self._extract_basic_message(
            payload.get('rows'), 'CyExtendInfo', self._EXTEND_FIELDS_1001)
        print(basic_message)
        return basic_message

    def save_csv_1001(self, result):
        """Append one 1001-type record to ``1001.csv`` (no header row)."""
        self._append_row('1001.csv',
                         self._COMMON_FIELDS + self._EXTEND_FIELDS_1001,
                         result)

    def save_csv_1002(self, result):
        """Append one 1002-type record to ``1002.csv`` (no header row)."""
        self._append_row('1002.csv',
                         self._COMMON_FIELDS + self._EXTEND_FIELDS_1002,
                         result)

    def run(self):
        """Crawl every search page and store each issuer's detail record."""
        # Step by the page size so no records between pages are skipped.
        # NOTE(review): assumes the API honours ``limit=10``; confirm that a
        # page really holds at most 10 rows.
        for offset in range(0, self._TOTAL_RECORDS, self._PAGE_SIZE):
            # The template has two placeholders: offset and an empty filter.
            url = self.url.format(offset, '')
            for issuer_id, issuer_type in self.parse_IssuerID_IssuerType(url):
                if issuer_type == '产业':
                    result = self.parse_basic_message_1001(issuer_id)
                    if result:
                        self.save_csv_1001(result)
                elif issuer_type == '城投':
                    result = self.parse_basic_message_1002(issuer_id)
                    if result:
                        self.save_csv_1002(result)
                # Random pause between issuers to keep the request rate low.
                time.sleep(random.uniform(1, 4))
if __name__ == '__main__':
    # Script entry point: build the spider and start the crawl.
    YYpingjiSpider().run()
注意：该网站访问频率太高会被封账号。
来源: http://www.bubuko.com/infodetail-3259460.html
与本文相关文章

暂无,快来抢沙发吧！