使用 selenium 实现谷歌以图搜图爬虫(爬取大图)

实现思路

原理非常简单,就是利用selenium去操作浏览器,获取到想要的链接,然后进行图片的下载,和一般的爬虫无异。

用到的技术:multiprocessing,selenium,xpath,requests

以下按照代码执行的顺序进行讲解。

首先导入需要的包

# coding=utf-8
import base64
import hashlib
import os
import re
import shutil
import time
from multiprocessing import Pool, cpu_count

import requests
import tqdm
from colorama import Fore
from selenium import webdriver
from selenium.common.exceptions import (ElementNotVisibleException,
                                        StaleElementReferenceException)
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

定义一个 run() 函数,作为入口。这里使用多进程技术,同时打开多个浏览器进行图片爬取。

def run():
    """Entry point: fan the upload images out to one crawler process per CPU core."""
    num_process = cpu_count()  # number of worker processes = number of CPU cores
    pool = Pool(num_process)  # build a process pool
    filelist = []
    upload = r"./upload"  # folder holding the images to reverse-search
    getfilelist(upload, filelist)  # recursively collect every image file in the folder
    # Split the file list into num_process roughly equal slices, one per worker
    result = partition(filelist, num_process)
    pool.map_async(download_task, result)  # submit the download tasks to the pool
    pool.close()  # no more tasks may be submitted
    pool.join()  # wait for every worker to finish

其中 getfilelist() 函数是递归查找,工作中用得很多了。

# Image file extensions recognised by is_img() (lower-case; compared case-insensitively)
EXTEND = [".bmp", ".jpg", ".jpeg", ".tif", ".tiff",
          ".jfif", ".png", ".gif", ".iff", ".ilbm"]


def is_img(img_path):
    """Return True if *img_path* looks like an image, judged by its extension."""
    # .lower() makes the check case-insensitive (".JPG" previously slipped through)
    ext = os.path.splitext(img_path)[1].lower()
    return ext in EXTEND


def getfilelist(path, filelist):
    """Recursively collect every image file under *path* into *filelist* (in place)."""
    for im_name in os.listdir(path):
        full = os.path.join(path, im_name)
        if os.path.isdir(full):
            getfilelist(full, filelist)  # recurse into sub-folders
        elif is_img(im_name):
            filelist.append(full)

partition() 函数用于将一个列表均分为几份,以便实现多进程。

def partition(ls, size):
    """Split *ls* into *size* consecutive slices.

    Each slice holds len(ls) // size items; when the length is not evenly
    divisible, the final slice absorbs the remainder.
    """
    num_per_list = len(ls) // size
    result = []
    if num_per_list * size == len(ls):
        # Evenly divisible: every slice has exactly num_per_list items
        for i in range(size):
            result.append(ls[num_per_list * i:num_per_list * (i + 1)])
    else:
        # The last slice takes whatever is left over
        for i in range(size - 1):
            result.append(ls[num_per_list * i:num_per_list * (i + 1)])
        result.append(ls[num_per_list * (size - 1):])
    return result

download_task() 为具体的下载任务,一个 task 实例化一个 GoogleSearcher 类,遍历自己的图片列表进行以图搜图。

def download_task(filelist):
    """Worker entry: run a reverse-image search for every file in *filelist*."""
    searcher = GoogleSearcher(
        download=r"./download")
    for file in filelist:
        searcher.simple_file_run(file)  # upload one image and crawl its results

GoogleSearcher类比较长,在注释中进行讲解。

# Windows user name, used by the (optional) Chrome user-data-dir argument below.
# .get() keeps the import from raising KeyError on platforms without this variable.
USERNAME = os.environ.get('USERNAME', '')


class GoogleSearcher:
    """Drives Chrome through Google reverse-image search and downloads the result images."""

    def __init__(self, download="download", sleep_time=1):
        """
        :param download: root folder the crawled images are saved under
        :param sleep_time: seconds to wait after the result page has loaded
        """
        super().__init__()
        self._download = download  # download folder
        self.sleep_time = sleep_time  # wait time after the result page loads
        self.header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"}

        os.makedirs(self._download, exist_ok=True)  # create the download folder

        self.option = webdriver.ChromeOptions()
        # self.option.add_argument("--user-data-dir=" + f"C:/Users/{USERNAME}/AppData/Local/Google/Chrome/User Data/")
        # self.option.add_argument("headless")  # if use headless, may failed.
        self.option.add_argument("disable-gpu")
        self.driver = webdriver.Chrome(options=self.option)  # browser instance

    def upload_img_get_html(self, file):
        """Upload *file* to Google Images and return the result page's HTML source."""
        print(
            f"{Fore.GREEN} Begin to upload image {os.path.split(file)[1]} {Fore.RESET}")
        self.driver.get("https://www.google.com/imghp")

        # wait for the camera button to appear
        condition_1 = EC.visibility_of_element_located(
            (By.CLASS_NAME, "LM8x9c"))
        WebDriverWait(self.driver, timeout=20,
                      poll_frequency=0.5).until(condition_1)
        # click the camera button once it is visible
        image_button = self.driver.find_element_by_class_name("LM8x9c")
        image_button.send_keys(Keys.ENTER)

        # wait for the "upload an image" tab to show up
        condition_2 = EC.visibility_of_element_located(
            (By.ID, "dRSWfb"))
        WebDriverWait(self.driver, timeout=20, poll_frequency=0.5).until(
            condition_2)

        # click "upload an image"
        upload = self.driver.find_element_by_xpath('//*[@id="dRSWfb"]/div/a')
        upload.send_keys(Keys.ENTER)

        # locate the file-upload widget
        condition_3 = EC.visibility_of_element_located(
            (By.ID, 'awyMjb'))
        WebDriverWait(self.driver, timeout=10, poll_frequency=0.5).until(
            condition_3)
        input_ = self.driver.find_element_by_id('awyMjb')

        # the widget is an <input>, so sending the file path uploads the file
        input_.send_keys(file)
        print(f"{Fore.GREEN} uploaded {Fore.RESET}")

        # the page redirects to the search-result page
        condition_4 = EC.visibility_of_element_located(
            (By.XPATH, '//*[@id="top_nav"]'))
        WebDriverWait(self.driver, timeout=20,
                      poll_frequency=0.5).until(condition_4)
        # give the page a moment to settle
        time.sleep(self.sleep_time)

        # print(driver.current_url)
        # print(driver.page_source)
        print(f"{Fore.GREEN} Finish download source code{Fore.RESET}")
        return self.driver.page_source

    def highlight(self, element):
        """Visually mark *element* in the browser (debugging aid)."""
        self.driver.execute_script(
            "arguments[0].setAttribute('style', arguments[1]);", element, "background: yellow; border: 2px solid red;")

    def wait_and_click(self, xpath):
        """Click the element at *xpath*, refreshing the page and retrying on timeout."""
        #  Sometimes click fails unreasonably. So tries to click at all cost.
        try:
            w = WebDriverWait(self.driver, 15)
            elem = w.until(EC.element_to_be_clickable((By.XPATH, xpath)))
            elem.click()
            self.highlight(elem)
        except Exception as e:
            print('Click time out - {}'.format(xpath))
            print('Refreshing browser...')
            # was `self.browser.refresh)` — no `browser` attribute exists on this class
            self.driver.refresh()
            time.sleep(2)
            return self.wait_and_click(xpath)
        return elem

    def get_extension_from_link(self, link, default='jpg'):
        """Guess the image extension from *link*; fall back to *default*."""
        splits = str(link).split('.')
        if len(splits) == 0:
            return default
        ext = splits[-1].lower()
        if ext == 'jpg' or ext == 'jpeg':
            return 'jpg'
        elif ext == 'gif':
            return 'gif'
        elif ext == 'png':
            return 'png'
        else:
            return default

    def base64_to_object(self, src):
        """Decode a base64 data-URI *src* (e.g. "data:image/png;base64,...") into raw bytes."""
        header, encoded = str(src).split(',', 1)
        data = base64.decodebytes(bytes(encoded, encoding='utf-8'))
        return data

    def download_images(self, links, download_dir):
        """Download every image in *links* into *download_dir*, numbered 0000, 0001, ..."""
        total = len(links)
        for index, link in enumerate(links):
            try:
                if len(link) < 100:
                    print('Downloading {} : {} / {}'.format(link, index + 1, total))
                else:
                    # the link is very long; print only a prefix
                    print(
                        'Downloading {} : {} / {}'.format(link[:100], index + 1, total))
                if str(link).startswith('data:image/jpeg;base64'):
                    # base64-encoded jpg image
                    response = self.base64_to_object(src=link)
                    ext = 'jpg'
                    is_base64 = True
                elif str(link).startswith('data:image/png;base64'):
                    # base64-encoded png image
                    response = self.base64_to_object(src=link)
                    ext = 'png'
                    is_base64 = True
                else:
                    # ordinary image hyperlink
                    response = requests.get(link, stream=True, timeout=5)
                    ext = self.get_extension_from_link(link=link)
                    is_base64 = False

                path = os.path.join(download_dir, str(index).zfill(4) + "." + ext)
                try:
                    with open(path, "wb") as f:
                        # two save paths: raw decoded bytes vs streamed HTTP response
                        if is_base64:
                            f.write(response)
                        else:
                            shutil.copyfileobj(response.raw, f)
                except Exception as e:
                    print('Save failed - {}'.format(e))

                del response
            except Exception as e:
                print('Download failed - ', e)
                continue

    def get_full_resolution_links(self):
        """Walk the preview pane with the RIGHT-arrow key and collect full-resolution links."""
        print('[Full Resolution Mode]')
        time.sleep(1)
        elem = self.driver.find_element_by_tag_name("body")
        print('Scraping links')
        self.wait_and_click('//div[@data-ri="0"]')
        time.sleep(1)
        links = []
        count = 1
        last_scroll = 0
        scroll_patience = 0
        while True:
            try:
                xpath = '//div[@id="islsp"]//div[@class="v4dQwb"]'
                div_box = self.driver.find_element(By.XPATH, xpath)
                self.highlight(div_box)
                xpath = '//img[@class="n3VNCb"]'
                img = div_box.find_element(By.XPATH, xpath)
                self.highlight(img)
                xpath = '//div[@class="k7O2sd"]'
                loading_bar = div_box.find_element(By.XPATH, xpath)
                # wait for the image to finish loading; otherwise src is only a
                # base64-encoded thumbnail instead of the full-resolution link
                while str(loading_bar.get_attribute('style')) != 'display: none;':
                    time.sleep(0.1)
                src = img.get_attribute('src')
                if src is not None:
                    links.append(src)
                    if len(src) < 100:
                        print('%d: %s' % (count, src))
                    else:
                        print('%d: %s' % (count, src[:100]))  # print only a prefix when long
                    count += 1
            except StaleElementReferenceException:
                pass
            except Exception as e:
                print(
                    '[Exception occurred while collecting links from google_full] {}'.format(e))
            scroll = self.driver.execute_script("return window.pageYOffset;")  # current scroll offset
            if scroll == last_scroll:
                # the page did not move this round
                scroll_patience += 1
            else:
                scroll_patience = 0
                last_scroll = scroll
            if scroll_patience >= 30:
                # stuck for 30 rounds: assume we reached the end of the results
                break
            elem.send_keys(Keys.RIGHT)
        links = list(dict.fromkeys(links))  # deduplicate while preserving order
        print('Collect links done. Total: {}'.format(len(links)))
        return links

    def simple_file_run(self, img):
        """Upload *img*, open its "similar images" page and download every result."""
        img_name = os.path.splitext(os.path.split(img)[1])[0]  # image name (no extension)
        parent_name = os.path.split(os.path.split(img)[0])[-1]  # parent folder name = image category
        print("--> Processing image:  {}  ".format(img_name))
        download_dir = os.path.join(self._download, parent_name, img_name)
        os.makedirs(download_dir, exist_ok=True)
        html_source = self.upload_img_get_html(img)  # upload and land on the result page
        similar_img_href = self.driver.find_element_by_xpath(
            '//div[@class="e2BEnf U7izfe"]/h3/a')
        similar_img_href.click()  # open the "similar images" list page
        links = self.get_full_resolution_links()  # collect every full-resolution link
        self.download_images(links, download_dir)  # download the large images
        print("{}Image {} finished\n{}".format(
            Fore.GREEN, img_name, Fore.RESET))

整个流程就跟打开浏览器进行操作一样,难点在于如何控制速度,不被谷歌反爬,不然出现谷歌验证码,破解是不可能的,就要帮它免费打码了。

有何用途

当你需要训练一个图片分类的模型,手头上图片有限,那就可以用这个方法,每一张图都找跟它相似的,轻轻松松就把训练集扩大了几十倍(理想情况,不被反爬的话)。

参考

https://github.com/YoongiKim/AutoCrawler
https://github.com/Cyberist-Edgar/Google_Image_Searcher

Published by

风君子

独自遨游何稽首 揭天掀地慰生平