1. Find the URL pattern on the site
Image source for the crawler: Baidu Images
Open DevTools with F12 -> Network -> XHR, then keep scrolling down through the image results. New requests appear under XHR; click one and open its Headers tab to find the pattern in the request URL.
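For Baidu Images, every XHR request hits the same acjson endpoint and only the pn parameter changes (0, 30, 60, ... with 30 results per page). Below is a minimal sketch to confirm that pattern before writing the full crawler, using the query string captured from the Headers tab (the same one the script below uses; Baidu may change these parameters over time):

import re
import urllib.parse

import requests

# Query string copied from the DevTools Headers tab; only pn varies per page.
BASE = ("https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592"
        "&is=&fp=result&queryWord={word}&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1"
        "&z=&ic=0&word={word}&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1"
        "&fr=&pn={pn}&rn=30&gsm=78&1524745446180=")

word = urllib.parse.quote("鸟类")  # example keyword ("birds")
for pn in (0, 30):  # first two pages
    html = requests.get(BASE.format(word=word, pn=pn), timeout=10).text
    print("pn=%d -> %d thumbnails" % (pn, len(re.findall(r'"thumbURL":"(.*?)"', html))))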
2. The download code
#!/usr/bin/env python
# coding=utf-8
import json
import os
import re
import sys
import urllib.parse

import requests


# Build the generator of request URLs; pn advances by 30 per page of results
def buildUrls(word):
    word = urllib.parse.quote(word)
    url = r"https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord={word}&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=0&word={word}&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&pn={pn}&rn=30&gsm=78&1524745446180="
    urls = (url.format(word=word, pn=x * 30) for x in range(60))
    return urls


# Extract the image URLs ("thumbURL" fields) from the JSON response text
re_url = re.compile(r'"thumbURL":"(.*?)"')


def resolveImgUrl(html):
    imgUrls = re_url.findall(html)
    return imgUrls


# Download one image into dirpath; return True on success
def downImg(imgUrl, dirpath, imgName):
    filename = os.path.join(dirpath, imgName)
    try:
        res = requests.get(imgUrl, timeout=15)
        if str(res.status_code)[0] == "4":
            print(str(res.status_code), ":", imgUrl)
            return False
    except Exception as e:
        print("This is Exception:", imgUrl)
        print(e)
        return False
    with open(filename, "wb") as f:
        f.write(res.content)
    return True


# Create the output directory (and any missing parents) if needed
def mkDir(dirName):
    dirpath = os.path.join(sys.path[0], dirName)
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)
    return dirpath


if __name__ == '__main__':
    word = '鸟类'  # search keyword ("birds")
    dirpath = mkDir("D:/getdata/down/baidu/" + word)
    urls = buildUrls(word)
    index = 0
    for url in urls:
        print("requesting:", url)
        html = requests.get(url, timeout=10).content.decode('utf-8')
        imgUrls = resolveImgUrl(html)
        if len(imgUrls) == 0:  # stop when a page returns no images
            break
        for imgUrl in imgUrls:
            if downImg(imgUrl, dirpath, "baidu" + str(index) + ".jpg"):
                index += 1
                print("Downloaded %s pictures" % index)
3. Bing image crawler
import json
import os

import eventlet
import requests
from bs4 import BeautifulSoup

urlshu = 1      # value of the "first" parameter in the URL
pictureshu = 1  # running number used as the file name (failed images count too)
soupshu = 0     # index into the 35-item result list of the current page
whileshu = 35   # loop counter (each page holds 35 images)

save_path = 'E:/Learn/data/Crawlerdata_people/'  # where the images are saved
if not os.path.exists(save_path):
    os.mkdir(save_path)  # create the output folder

url1 = 'https://cn.bing.com/images/async?q='
url2 = '&first=%d&count=35&cw=1177&ch=737&relp=35&tsc=ImageBasicHover&datsrc=I&layout=RowBased&mmasync=1'

# Request headers for the page that lists 35 images
head1 = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.64'
}

# Request headers for the individual image
head2 = {
    'Cookie': 'Hm_lvt_d60c24a3d320c44bcdbc61f703=,',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.64'
}

print('Enter the search term:')
content = input()
print('Enter how many images to download:')
number = int(input())

url = url1 + content + url2  # full URL of the listing page (35 images per page)

while whileshu:
    r0 = requests.get(url % urlshu, headers=head1).text              # GET the listing page as text
    soup = BeautifulSoup(r0, features="lxml").find_all('a', 'iusc')  # find the <a class="iusc"> tags
    data = str(soup[soupshu].get('m'))                               # take the 'm' attribute of the soupshu-th tag
    zidian = json.loads(data)                                        # parse it into a dict
    ifopen = 1  # flag: whether to save this image below
    with eventlet.Timeout(1, False):  # 1-second timeout guard
        try:
            picture = requests.get(zidian['murl'], headers=head2).content  # GET the image as binary data
        except:
            print('Image %d timed out' % pictureshu)  # this image failed
            ifopen = 0  # skip it instead of hanging and then crashing
    while ifopen == 1:
        text = open(save_path + '%d' % pictureshu + '.jpg', 'wb')  # 'wb' = write binary data
        text.write(picture)
        text.close()
        ifopen = 0
        number = number - 1
    pictureshu = pictureshu + 1
    soupshu = soupshu + 1
    whileshu = whileshu - 1
    if whileshu == 0:  # this page is done: advance "first" and reset the per-page counters
        urlshu = urlshu + 1
        whileshu = 35
        soupshu = 0
    if number == 0:  # downloaded the requested number of images; stop
        break
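Note that eventlet.Timeout only interrupts a blocking download reliably when eventlet has monkey-patched the standard socket module; otherwise requests.get can still block past the one-second guard. A minimal sketch of the same guard using the timeout parameter that requests supports natively (assuming a plain per-request timeout is acceptable here) would replace the with eventlet.Timeout(1, False): block above:

# Sketch: per-request timeout instead of the eventlet.Timeout block above.
try:
    picture = requests.get(zidian['murl'], headers=head2, timeout=1).content
except requests.exceptions.RequestException:
    print('Image %d timed out or failed' % pictureshu)
    ifopen = 0  # skip this image, as in the original code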