python webdriver基于Bing搜索引擎根据关键词爬取相关图片

大家好，欢迎来到IT知识分享网。

import time
import requests
import base64
from selenium import webdriver
from selenium.webdriver.edge.service import Service as EdgeService
from webdriver_manager.microsoft import EdgeChromiumDriverManager
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
import os


class SearchGoblin():
    """Crawl Bing image search results for a keyword and save them to disk.

    The crawler drives an Edge browser through Selenium, scrolls the result
    page to load more thumbnails, collects the ``src`` of every matching
    ``<img>`` element and stores the images as ``<index>.jpg`` files in
    ``save_path``. Inline ``data:`` URIs are decoded locally; every other
    ``src`` is downloaded with ``requests``.
    """

    def __init__(self, kwd, save_path='default', web='https://cn.bing.com/images/trending?FORM=ILPTRD'):
        """Store the search settings and make sure the output folder exists.

        Args:
            kwd: Keyword typed into the search box.
            save_path: Output directory. ``'default'`` means ``./<kwd>``.
            web: Start page opened by the browser.
        """
        self.website = web
        self.keyword = kwd
        self.save_path = './' + kwd if save_path == 'default' else save_path
        # Create the folder for custom paths too; otherwise the first
        # open() in the save methods fails with FileNotFoundError.
        os.makedirs(self.save_path, exist_ok=True)

    def is_visible(self, driver, loc, timeout=10):
        """Return True if a link with text ``loc`` becomes visible in time.

        Args:
            driver: Selenium WebDriver instance to poll.
            loc: Exact link text to look for.
            timeout: Maximum number of seconds to wait.
        """
        try:
            WebDriverWait(driver, timeout).until(EC.visibility_of_element_located((By.LINK_TEXT, loc)))
            return True
        except TimeoutException:
            return False

    def bing_log_in(self):
        """Start a maximized Edge browser, open the start page, return the driver."""
        edge_options = webdriver.EdgeOptions()
        edge_options.use_chromium = True
        edge_options.add_argument('--disable-blink-features=AutomationControlled')
        driver = webdriver.Edge(service=EdgeService(EdgeChromiumDriverManager().install()), options=edge_options)
        driver.maximize_window()
        driver.get(self.website)

        return driver

    def keyword_search(self, driver, max_scrolls=20):
        """Search for the keyword and scroll so that more thumbnails load.

        Args:
            driver: Selenium WebDriver positioned on the start page.
            max_scrolls: Upper bound on scroll attempts while waiting for
                the "see more images" link, so a page that never shows the
                link cannot hang the crawler.

        Returns:
            The same driver, now on the scrolled result page.
        """
        scroll_js = "window.scrollTo(0,document.body.scrollHeight)"
        more_link = '查看更多图片'

        driver.find_element(By.XPATH, '//*[@id="sb_form_q"]').send_keys(self.keyword)
        driver.find_element(By.XPATH, '//*[@id="sb_form_go"]').click()

        # Scroll to the bottom until the "see more images" link shows up,
        # giving up after max_scrolls attempts.
        found = self.is_visible(driver, more_link, timeout=3)
        attempts = 0
        while not found and attempts < max_scrolls:
            driver.execute_script(scroll_js)
            attempts += 1
            found = self.is_visible(driver, more_link, timeout=3)

        if found:
            driver.execute_script(scroll_js)
            driver.find_element(By.LINK_TEXT, more_link).click()
            time.sleep(1)
        else:
            print('未找到“{}”链接，仅抓取已加载的图片'.format(more_link))

        for i in range(10):
            driver.execute_script(scroll_js)
            # Pause so the page has time to load after each scroll.
            time.sleep(3)
            print('拖动第{}次'.format(i+1))

        return driver

    def img_src_collect(self, driver):
        """Collect thumbnail sources and split them by how they must be saved.

        Args:
            driver: Selenium WebDriver positioned on the result page.

        Returns:
            A ``(direct_img, indirect_img)`` tuple: ``direct_img`` holds
            inline ``data:`` URIs, ``indirect_img`` holds every other src.
        """
        elements = driver.find_elements(By.XPATH, "//img[contains(@class, 'mimg')]")
        # get_attribute returns None when an <img> has no src; skip those
        # instead of crashing on the membership test below.
        srcs = [src for src in (element.get_attribute('src') for element in elements) if src]
        print('共爬取{}张图片'.format(len(srcs)))

        # Test the scheme prefix: a plain substring check would misfile any
        # http URL that merely contains the word "data".
        direct_img = [src for src in srcs if src.startswith('data:')]
        indirect_img = [src for src in srcs if not src.startswith('data:')]
        print(len(direct_img))
        print(indirect_img)
        return direct_img, indirect_img

    def decode_base64(self, dir_img):
        """Decode inline ``data:`` URIs and write them as ``<index>.jpg``.

        Files are numbered consecutively from 0. Entries without a comma
        separator or with an undecodable payload are skipped.

        Args:
            dir_img: Iterable of ``data:...;base64,<payload>`` strings.

        Returns:
            The index of the last file written, or ``-1`` if none was
            written, so the caller can continue numbering at ``result + 1``.
        """
        name_index = -1
        for data_uri in dir_img:
            _, separator, payload = data_uri.partition(',')
            if not separator:
                print('跳过无法解析的图片数据')
                continue
            try:
                # Decode before opening the file so a bad payload does not
                # leave an empty .jpg behind.
                img = base64.b64decode(payload)
            except ValueError:
                print('跳过无法解析的图片数据')
                continue
            name_index += 1
            with open(os.path.join(self.save_path, str(name_index)+'.jpg'), 'wb') as file:
                file.write(img)
        return name_index

    def indir_img_request(self, indir_img, name_index):
        """Download images by URL and save them after the decoded ones.

        Downloading is best effort: a failed request or write is reported
        and skipped, and the remaining URLs are still processed.

        Args:
            indir_img: Iterable of image URLs.
            name_index: Index of the last file already written; numbering
                continues at ``name_index + 1``.
        """
        # Built once: the headers are identical for every request.
        headers = {
            'authority': 'tse2-mm.cn.bing.net',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
            'cache-control': 'max-age=0',
            'sec-ch-ua': '"Microsoft Edge";v="113", "Chromium";v="113", "Not-A.Brand";v="24"',
            'sec-ch-ua-arch': '"x86"',
            'sec-ch-ua-bitness': '"64"',
            'sec-ch-ua-full-version': '"113.0.1774.50"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-model': '""',
            'sec-ch-ua-platform': '"Windows"',
            'sec-ch-ua-platform-version': '"15.0.0"',
            'sec-fetch-dest': 'document',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-site': 'none',
            'sec-fetch-user': '?1',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.50',
        }

        start_index = name_index + 1
        for img in indir_img:
            try:
                # The timeout keeps one stalled server from blocking the
                # whole crawl; raise_for_status stops error pages from
                # being saved as .jpg files.
                response = requests.get(img, headers=headers, timeout=15)
                response.raise_for_status()
                with open(os.path.join(self.save_path, str(start_index)+'.jpg'), 'wb') as f:
                    f.write(response.content)
            except (requests.RequestException, OSError) as err:
                print('下载失败，已跳过: {} ({})'.format(img, err))
                continue
            start_index += 1

    def run(self):
        """Run the whole crawl: search, collect sources, save all images."""
        driver = self.bing_log_in()
        try:
            driver = self.keyword_search(driver)
            dir_img, ind_img = self.img_src_collect(driver)
        finally:
            # The collected sources are plain strings, so the browser can
            # be closed before downloading; always release it, even on error.
            driver.quit()
        final_dir_name_index = self.decode_base64(dir_img)
        self.indir_img_request(ind_img, final_dir_name_index)

if __name__ == '__main__':
    # Example run: crawl images for the keyword and store them in the
    # default folder named after the keyword.
    crawler = SearchGoblin(kwd='猫', save_path='default')
    crawler.run()

使用方法：只需在主程序入口（if __name__ == '__main__'）中修改创建 SearchGoblin 对象时传入的 kwd 参数（这里以“猫”为例），即可爬取对应关键词的图片；图片默认保存在当前目录下以关键词命名的文件夹中。

免责声明：本站所有文章内容,图片，视频等均是来源于用户投稿和互联网及文摘转载整编而成，不代表本站观点，不承担相关法律责任。其著作权各归其原作者或其出版社所有。如发现本站有涉嫌抄袭侵权/违法违规的内容,侵犯到您的权益，请在线联系站长,一经查实,本站将立刻删除。本文来自网络,若有侵权，请联系删除，如若转载，请注明出处：https://haidsoft.com/116292.html

python webdriver基于Bing搜索引擎根据关键词爬取相关图片

相关推荐

发表回复