(数据科学学习手札 56) 利用机器学习破解大众点评文字反爬

一, 引言

爬取过大众点评的朋友应该会遇到这样的问题, 在网页中看起来正常的文字, 在其源代码中变成了下面这样:

究其原因，是因为大众点评在内容上设置了特别的反爬机制。与某些网站替换底层字体文件的做法不同，大众点评会随机地用 SVG 图片替换对应位置的汉字，使得我们使用常规手段无法获取其网页中完整的文字内容。经过观察我发现，所有可以被 SVG 图像替换的文字都保存在下图所示的地址中:

打开该页面后可以发现其包含了所有可以被 SVG 替换的文字:

def OfferLocalBrowser(headless=False):
    '''
    Build a Chrome WebDriver that starts from a local Chrome user profile.

    The profile directory is handed to Chrome through the ``user-data-dir``
    argument; presumably this is what lets the browser reuse an existing
    Dianping login -- confirm the hard-coded path exists on the target machine.

    :param headless: when true, ``--headless`` is added to the Chrome options
    :return: the created ``webdriver.Chrome`` instance
    '''
    chrome_options = webdriver.ChromeOptions()
    launch_args = [r'user-data-dir=C:\Users\hp\AppData\Local\Google\Chrome\User Data']
    if headless:
        launch_args.append('--headless')
    for launch_arg in launch_args:
        chrome_options.add_argument(launch_arg)
    return webdriver.Chrome(options=chrome_options)
def CollectDataset(targetUrl,low=3,high=6,page=3,refreshTime=3):
    '''
    Collect position-aligned character / SVG-code samples from review pages.

    Every page is loaded ``refreshTime`` times. On each load the review text
    is flattened into a sequence of fragments, each either one plain Chinese
    character or one ``<span class="...">`` placeholder. Plain characters are
    stored in ``wordList`` and placeholder class names in ``codeList`` at the
    fragment's position, so a position that shows up as a character on one
    load and as a placeholder on another ends up holding both.

    :param targetUrl: review-page URL with one ``{}`` slot for the page number
    :param low: lower bound (inclusive) of the random sleep, in seconds
    :param high: upper bound (exclusive) of the random sleep, in seconds
    :param page: number of pages to visit, starting from page 1
    :param refreshTime: how many times each page is loaded
    :return: ``(WL, CL)`` -- the character list and the code list, aligned
             index by index; unknown entries are empty strings
    '''
    # CL accumulates every code and WL every character across all pages.
    CL, WL = [], []
    browser = OfferLocalBrowser(headless=False)
    try:
        for p in tqdm(range(1, page + 1)):
            # Reset per page so refreshTime=0 contributes nothing instead of
            # hitting unbound names below.
            wordList, codeList = [], []
            for r in range(refreshTime):
                browser.get(url=targetUrl.format(p))
                if '3s 未完成验证, 请重试.' in str(browser.page_source):
                    # Block until the operator has dealt with the verification
                    # page by hand and pressed Enter.
                    input()
                # Re-read the source: it may have changed while waiting above.
                HTML = browser.page_source
                obj = BeautifulSoup(HTML, 'lxml')
                raw_comment = obj.find_all('div', {'class': 'review-words Hide'})
                # Each match fills exactly one of the two groups: the span tag
                # or the single Chinese character.
                firstList = re.findall('(<span class="[a-z0-9]+">)|([\u4e00-\u9fa5]{1})', str(raw_comment))
                actualList = [spanTag or char for spanTag, char in firstList]
                print(len(actualList))
                if r == 0:
                    wordList = [''] * len(actualList)
                    codeList = [''] * len(actualList)
                if len(actualList) != len(wordList):
                    # Positions only line up when the fragment count matches
                    # the first load; merging a different-length load would
                    # mis-pair characters and codes (or run past the lists).
                    print('fragment count differs from the first load of page {}; refresh skipped'.format(p))
                else:
                    for index, fragment in enumerate(actualList):
                        if '<' in fragment:
                            codeList[index] = re.findall('class="([a-z0-9]+)"', fragment)[0]
                        else:
                            wordList[index] = fragment
                # Random pause between loads to lower the chance of a ban.
                time.sleep(np.random.randint(low, high))
            CL.extend(codeList)
            WL.extend(wordList)
            print('总列表长度:{}'.format(len(CL)))
    finally:
        # Always release the browser, even when a load or parse fails.
        browser.quit()
    return WL,CL

-_-!):
def CreateXandY(wordList,codeList,CSSUrl=None,SvgUrl=None,cssUrl=None):
    '''
    Build the feature matrix X and label matrix Y from the collected samples.

    For every position where both a character and a code were observed, the
    code's two background offsets are read from the CSS text (features) and
    the character's placement is read from the SVG page (labels).

    :param wordList: character list, aligned with ``codeList``
    :param codeList: code (span class name) list, aligned with ``wordList``
    :param CSSUrl: address of the CSS page
    :param SvgUrl: address of the SVG page
    :param cssUrl: alternative spelling of ``CSSUrl``; takes precedence when
                   both are given
    :return: ``(X, Y, Svg2Label, CodeWithIpx)`` where each X row is the two
             offsets of a code, each Y row is
             ``[ColIndex, Nchar, RowIndex, textLength]`` of its character,
             ``Svg2Label`` is the per-character placement table and
             ``CodeWithIpx`` is the downloaded CSS text
    :raises TypeError: when the CSS or SVG address is missing
    '''
    if cssUrl is None:
        cssUrl = CSSUrl
    if cssUrl is None or SvgUrl is None:
        raise TypeError('CreateXandY() requires both a CSS url and an SVG url')

    def GetSvgWordIpx(SvgUrl=SvgUrl):
        '''
        Download the SVG page and index every character found in it.

        :param SvgUrl: address of the SVG page
        :return: dict keyed by character; each value holds ``RowIndex`` and
                 ``ColIndex`` (both 1-based), the row's ``textLength`` and
                 ``Nchar`` (number of characters in that row)
        '''
        SvgWord = requests.get(SvgUrl).content.decode()
        Svg2Label = {}
        # One match per text row; whitespace between the two attributes is
        # optional so both `"textLength` and `" textLength` are accepted.
        rawList = re.findall(r'(<textPath xlink:href="#[0-9]+"\s*textLength="[0-9]+">.*?</textPath>)', SvgWord)
        for row, rowMarkup in enumerate(rawList, start=1):
            wordPreList = re.findall('[\u4e00-\u9fa5]{1}', rowMarkup)
            rowTextLength = int(re.search('textLength="(.*?)"', rowMarkup).group(1))
            # Within a row the first occurrence of a character wins; across
            # rows a later row replaces an earlier one.
            rowEntries = {}
            for col, Word in enumerate(wordPreList, start=1):
                rowEntries.setdefault(Word, {
                    'RowIndex': row,
                    'ColIndex': col,
                    'textLength': rowTextLength,
                    'Nchar': len(wordPreList),
                })
            Svg2Label.update(rowEntries)
        return Svg2Label

    CodeWithIpx = requests.get(cssUrl).content.decode()
    # code -> (first offset, second offset), as strings captured from the CSS
    code2ipx = {}
    # code -> character observed at the same position
    code2word = {}
    for code, Word in tqdm(zip(codeList, wordList)):
        if code == '' or Word == '':
            continue
        # The leading dot is escaped so the rule for `abc` cannot be confused
        # with a longer class name that merely ends in `abc`.
        rule = re.search(r'\.%s\{background:-(\d+)\.0px -(\d+)\.0px;\}' % re.escape(code), CodeWithIpx)
        if rule is None:
            # No rule for this code in the stylesheet: nothing to learn from.
            continue
        code2ipx[code] = rule.groups()
        code2word[code] = Word
    Svg2Label = GetSvgWordIpx()
    X, Y = [], []
    for code, offsets in code2ipx.items():
        label = Svg2Label.get(code2word[code])
        if label is None:
            # The paired character is not on the SVG page, so the pairing is
            # not usable as a training sample.
            continue
        X.append([int(offsets[0]), int(offsets[1])])
        Y.append([label['ColIndex'],
                  label['Nchar'],
                  label['RowIndex'],
                  label['textLength']])
    X = np.array(X)
    Y = np.array(Y)
    return X,Y,Svg2Label,CodeWithIpx

def GetModels(X,Y):
    '''
    Return the two decision-tree classifiers, training them when not cached.

    When both ``model1.m`` and ``model2.m`` exist in the current working
    directory they are loaded and ``X`` / ``Y`` are not used; otherwise both
    models are fitted and written to those files.

    :param X: feature matrix (independent variables)
    :param Y: label matrix; column 2 is the target of model 1 and column 0 is
              the target of model 2
    :return: ``(model1, model2)`` -- model 1 predicts the row index and
             model 2 the column index
    '''
    cached = set(os.listdir())
    if 'model1.m' in cached and 'model2.m' in cached:
        # NOTE: joblib.load unpickles the files; only load model files that
        # were produced locally by this function.
        model1,model2 = joblib.load('model1.m'),joblib.load('model2.m')
    else:
        # Target of model 1: row index of the character.
        model1 = DecisionTreeClassifier().fit(X, Y[:, 2])
        # Target of model 2: column index of the character.
        model2 = DecisionTreeClassifier().fit(X, Y[:, 0])
        # Persist both models so later runs can skip training.
        joblib.dump(model1,'model1.m')
        joblib.dump(model2,'model2.m')
    return model1,model2

def Translate(s,baseDF,model1,model2):
    '''
    Decode a sequence mixing plain characters and SVG placeholder spans.

    Plain fragments are copied through. For a ``<span class="...">`` fragment
    the class name's two background offsets are looked up in the module-level
    ``CodeWithIpx`` CSS text, fed to both models, and the predicted
    (row, column) pair is resolved to a character through ``baseDF``.

    :param s: iterable of 2-tuples ``(span_tag, character)`` in which one item
              is empty, as produced by ``re.findall`` with two groups
    :param baseDF: data frame with the columns ``字符``, ``Row`` and ``Col``
    :param model1: model predicting the row index
    :param model2: model predicting the column index
    :return: the decoded string
    :raises ValueError: when a span's class has no rule in ``CodeWithIpx``
    :raises IndexError: when the predicted cell is not present in ``baseDF``
    '''
    pieces = []
    for ele in s:
        for fragment in ele[:2]:
            if '<' not in fragment:
                # Plain character, or the empty half of the tuple.
                pieces.append(fragment)
                continue
            code = re.search('<span class="([a-z0-9]+)">', fragment).group(1)
            # The leading dot is escaped so the rule for `abc` cannot be
            # confused with a longer class name that merely ends in `abc`.
            rule = re.search(r'\.%s\{background:-(\d+)\.0px -(\d+)\.0px;\}' % re.escape(code), CodeWithIpx)
            if rule is None:
                raise ValueError('no background rule found for class %r' % code)
            # Both models consume the same single-sample feature row.
            features = np.array([int(offset) for offset in rule.groups()]).reshape(1, -1)
            row_ = model1.predict(features)
            col_ = model2.predict(features)
            candidates = baseDF['字符'][(baseDF['Row'] == row_.tolist()[0]) & (baseDF['Col'] == col_.tolist()[0])].tolist()
            if not candidates:
                raise IndexError('no character at predicted row %r, column %r'
                                 % (row_.tolist()[0], col_.tolist()[0]))
            pieces.append(candidates[0])
    return ''.join(pieces)

# Lookup table: one row per character with its 1-based row / column position.
# NOTE(review): Svg2Label is first assigned further down in this file (by the
# CreateXandY call), so this statement only works once that has run.
baseDF = pd.DataFrame({'字符': list(Svg2Label),
                       'Row': [placement['RowIndex'] for placement in Svg2Label.values()],
                       'Col': [placement['ColIndex'] for placement in Svg2Label.values()]})

# Test run: collect samples, build the models, then decode one review page.
wordList,codeList = CollectDataset(targetUrl = 'http://www.dianping.com/shop/72452707/review_all/p{}?queryType=sortType&queryVal=latest',
                                   low = 3,
                                   high = 6,
                                   page = 3,
                                   refreshTime = 3)
# The CSS address is passed positionally because the function signature spells
# its third parameter `CSSUrl`.
X,Y,Svg2Label,CodeWithIpx = CreateXandY(wordList,codeList,
                  'http://s3plus.meituan.net/v1/mss_0a06a471f9514fc79c981b5466f56b91/svgtextcss/c26b1e06f361cadaa823f1b76642e534.css',
                  SvgUrl = 'http://s3plus.meituan.net/v1/mss_0a06a471f9514fc79c981b5466f56b91/svgtextcss/d6a6b2d601063fb185d7b89931259d79.svg')
model1,model2 = GetModels(X,Y)
browser = OfferLocalBrowser()
try:
    browser.get('http://www.dianping.com/shop/124475710/review_all?queryType=sortType&&queryVal=latest')
    obj = BeautifulSoup(browser.page_source,'lxml')
finally:
    # The page source has been parsed, so the browser can be released even if
    # loading failed.
    browser.quit()
rawCommentList = obj.find_all('div',{'class':'review-words'})
# Lookup table from (row, column) to character, consumed by Translate.
baseDF = pd.DataFrame({'字符': [key for key in Svg2Label.keys()],
                                       'Row': [Svg2Label[key]['RowIndex'] for key in Svg2Label.keys()],
                                       'Col': [Svg2Label[key]['ColIndex'] for key in Svg2Label.keys()]})
for rawComment in rawCommentList:
    s = re.findall('(<span class="[a-z0-9]+">)|([\u4e00-\u9fa5]{1})',str(rawComment))
    print(Translate(s,baseDF,model1,model2))

来源: https://www.cnblogs.com/feffery/p/10617403.html

与本文相关文章

暂无,快来抢沙发吧！