大家好，欢迎来到 IT 知识分享网。
import time
import requests
import base64
from selenium import webdriver
from selenium.webdriver.edge.service import Service as EdgeService
from webdriver_manager.microsoft import EdgeChromiumDriverManager
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
import os
class SearchGoblin():
    """Collect image-search results for a keyword and save them as files.

    The workflow (see ``run``) opens the configured page in Edge, submits the
    keyword, scrolls to load more results, gathers the ``src`` attribute of
    every result ``<img>``, then writes inline ``data:`` images and downloaded
    URL images into ``save_path`` as ``0.jpg``, ``1.jpg``, ...
    """

    # Browser-like headers sent with every image download request.
    _DOWNLOAD_HEADERS = {
        'authority': 'tse2-mm.cn.bing.net',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'cache-control': 'max-age=0',
        'sec-ch-ua': '"Microsoft Edge";v="113", "Chromium";v="113", "Not-A.Brand";v="24"',
        'sec-ch-ua-arch': '"x86"',
        'sec-ch-ua-bitness': '"64"',
        'sec-ch-ua-full-version': '"113.0.1774.50"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-model': '""',
        'sec-ch-ua-platform': '"Windows"',
        'sec-ch-ua-platform-version': '"15.0.0"',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'none',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.50',
    }

    def __init__(self, kwd, save_path='default', web='https://cn.bing.com/images/trending?FORM=ILPTRD'):
        """Store the search settings and make sure the output directory exists.

        Args:
            kwd: Keyword typed into the search box.
            save_path: Directory for the saved images. The literal string
                ``'default'`` selects ``'./' + kwd``.
            web: URL of the page that hosts the search form.
        """
        self.website = web
        self.keyword = kwd
        self.save_path = './' + kwd if save_path == 'default' else save_path
        # Create the directory in both cases; a caller-supplied path that does
        # not exist yet would otherwise make every later file write fail.
        os.makedirs(self.save_path, exist_ok=True)

    def is_visible(self, driver, loc, timeout=10):
        """Return True if a link with text ``loc`` becomes visible in time.

        Args:
            driver: Selenium WebDriver showing the page.
            loc: Exact link text to wait for.
            timeout: Maximum number of seconds to wait.

        Returns:
            True when the link became visible, False when the wait timed out.
        """
        try:
            WebDriverWait(driver, timeout).until(EC.visibility_of_element_located((By.LINK_TEXT, loc)))
            return True
        except TimeoutException:
            return False

    def bing_log_in(self):
        """Start a maximized Edge session and open ``self.website``.

        No credentials are submitted here; the method only launches the
        browser and loads the page.

        Returns:
            The Selenium WebDriver for the new session.
        """
        edge_options = webdriver.EdgeOptions()
        edge_options.use_chromium = True
        edge_options.add_argument('--disable-blink-features=AutomationControlled')
        driver = webdriver.Edge(service=EdgeService(EdgeChromiumDriverManager().install()), options=edge_options)
        driver.maximize_window()
        driver.get(self.website)
        return driver

    def keyword_search(self, driver, scroll_times=10, max_wait_rounds=20):
        """Submit the keyword and scroll the result page to load more images.

        Args:
            driver: Selenium WebDriver positioned on the search page.
            scroll_times: Number of scroll-to-bottom passes performed after
                the "see more images" link has been handled.
            max_wait_rounds: Maximum number of scroll-and-wait rounds spent
                looking for the "see more images" link. The bound keeps the
                method from looping forever when the link never appears.

        Returns:
            The same ``driver`` instance.
        """
        scroll_to_bottom = "window.scrollTo(0,document.body.scrollHeight)"
        more_link_text = '查看更多图片'

        driver.find_element(By.XPATH, '//*[@id="sb_form_q"]').send_keys(self.keyword)
        driver.find_element(By.XPATH, '//*[@id="sb_form_go"]').click()

        for _ in range(max_wait_rounds):
            if self.is_visible(driver, more_link_text, timeout=3):
                driver.execute_script(scroll_to_bottom)
                driver.find_element(By.LINK_TEXT, more_link_text).click()
                time.sleep(1)
                break
            driver.execute_script(scroll_to_bottom)
        else:
            # The link never showed up; continue with what is already loaded.
            print('未找到“{}”链接, 继续处理已加载的图片'.format(more_link_text))

        for i in range(scroll_times):
            driver.execute_script(scroll_to_bottom)
            # Pause so newly requested results can load before the next pass.
            time.sleep(3)
            print('拖动第{}次'.format(i+1))
        return driver

    @staticmethod
    def _classify_sources(srcs):
        """Split ``src`` values into inline ``data:`` URIs and other URLs.

        Missing or empty values are dropped.

        Returns:
            A ``(direct_img, indirect_img)`` tuple of lists.
        """
        direct_img = []
        indirect_img = []
        for src in srcs:
            if not src:
                continue
            # Test the scheme prefix, not a substring: an ordinary URL may
            # contain the text "data" anywhere in its path.
            if src.startswith('data:'):
                direct_img.append(src)
            else:
                indirect_img.append(src)
        return direct_img, indirect_img

    def img_src_collect(self, driver):
        """Read the ``src`` of every result image on the current page.

        Args:
            driver: Selenium WebDriver showing the result page.

        Returns:
            A ``(direct_img, indirect_img)`` tuple: inline ``data:`` URIs and
            URLs that still have to be downloaded.
        """
        elements = driver.find_elements(By.XPATH, "//img[contains(@class, 'mimg')]")
        srcs = [item.get_attribute('src') for item in elements]
        print('共爬取{}张图片'.format(len(srcs)))
        direct_img, indirect_img = self._classify_sources(srcs)
        print(len(direct_img))
        print(indirect_img)
        return direct_img, indirect_img

    def decode_base64(self, dir_img):
        """Decode base64 ``data:`` URIs and write them into ``save_path``.

        Files are named ``0.jpg``, ``1.jpg``, ... in the order they are
        written. URIs that are not base64-encoded or fail to decode are
        skipped and do not consume a file name.

        Args:
            dir_img: Iterable of ``data:`` URI strings.

        Returns:
            The index used for the last file written, or ``-1`` when nothing
            was written. ``indir_img_request`` continues numbering at this
            value plus one.
        """
        name_index = -1
        for uri in dir_img:
            # Split on the first comma only, so a payload that itself
            # contains commas cannot break the unpacking.
            header, sep, payload = uri.partition(',')
            if not sep or not header.endswith(';base64'):
                print('跳过非base64图片: {}'.format(header))
                continue
            try:
                img = base64.b64decode(payload)
            except ValueError:
                # binascii.Error (bad padding etc.) is a ValueError subclass.
                print('跳过无法解码的图片: {}'.format(header))
                continue
            # Open the file only after decoding succeeded, so a failure
            # never leaves an empty file behind.
            name_index += 1
            with open(os.path.join(self.save_path, str(name_index)+'.jpg'), 'wb') as file:
                file.write(img)
        return name_index

    def indir_img_request(self, indir_img, name_index, timeout=10):
        """Download each image URL and save it into ``save_path``.

        Numbering starts at ``name_index + 1`` and advances only after a
        successful save. Failed downloads are reported and skipped.

        Args:
            indir_img: Iterable of image URLs.
            name_index: Last file index already in use.
            timeout: Per-request timeout in seconds passed to ``requests.get``.
        """
        start_index = name_index + 1
        for img in indir_img:
            try:
                response = requests.get(img, headers=self._DOWNLOAD_HEADERS, timeout=timeout)
                # Do not store an HTTP error page under an image file name.
                response.raise_for_status()
                with open(os.path.join(self.save_path, str(start_index)+'.jpg'), 'wb') as f:
                    f.write(response.content)
            except (requests.RequestException, OSError) as exc:
                print('下载失败: {} ({})'.format(img, exc))
                continue
            start_index += 1

    def run(self):
        """Run the whole pipeline: browse, collect sources, save images."""
        driver = self.bing_log_in()
        try:
            driver = self.keyword_search(driver)
            dir_img, ind_img = self.img_src_collect(driver)
        finally:
            # The browser is only needed to collect the sources; close it
            # even if searching or collecting fails.
            driver.quit()
        final_dir_name_index = self.decode_base64(dir_img)
        self.indir_img_request(ind_img, final_dir_name_index)
if __name__ == '__main__':
    # Script entry point: change ``kwd`` to search for a different keyword.
    # With save_path='default' the images go into a directory named after
    # the keyword.
    trial = SearchGoblin(save_path='default', kwd='猫')
    trial.run()
使用方法：只需在主程序入口（if __name__ == '__main__'）中修改传给 SearchGoblin 的 kwd 参数（此处以“猫”为例），即可检索并下载该关键词对应的图片。
免责声明：本站所有文章内容、图片、视频等均来源于用户投稿和互联网，或由文摘转载整编而成，不代表本站观点，本站不承担相关法律责任。其著作权归原作者或其出版社所有。如发现本站有涉嫌抄袭侵权/违法违规的内容，侵犯到您的权益，请在线联系站长，一经查实，本站将立刻删除。本文来自网络，若有侵权，请联系删除；如若转载，请注明出处：https://haidsoft.com/116292.html