大家好,欢迎来到IT知识分享网。
一、数据来源
二、区划编码现成文件
1、获取方式:
- csdn:资源绑定
- v:JFAN0329
- 私信
2、文件部分内容展示
三、Python 部分代码分析
import re
import time

import requests
import xlsxwriter
from bs4 import BeautifulSoup

# Root of the 2023 division-code pages; province hrefs on the index page are
# relative to this URL.
_BASE_URL = 'https://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2023/'

# Browser-like User-Agent sent with every request.
_REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
}

# Raw string: the path contains backslashes that must not be read as escapes.
_DEFAULT_OUTPUT_PATH = r'F:\Study_file\python\getqgqhbm\ssxqybm.xlsx'

_RETRY_DELAY_SECONDS = 2
_REQUEST_TIMEOUT_SECONDS = 30


def _fetch_soup(url, headers):
    """Download *url* and return it as a parsed BeautifulSoup document.

    A failed attempt (non-200 status or a requests-level error such as a
    timeout) is followed by a 2-second pause and another attempt against the
    SAME url; the function only returns once a 200 response was received.
    """
    while True:
        try:
            resp = requests.get(url, headers=headers,
                                timeout=_REQUEST_TIMEOUT_SECONDS)
        except requests.RequestException as exc:
            print(f"request failed, retrying: {url} ({exc})")
        else:
            if resp.status_code == 200:
                # 'zh-CN' (used previously) is a locale tag, not a codec name.
                # NOTE(review): assumes the 2023 pages are UTF-8 - confirm.
                resp.encoding = 'utf-8'
                return BeautifulSoup(resp.text, "html.parser")
        time.sleep(_RETRY_DELAY_SECONDS)


def _write_divisions(worksheet, index_page):
    """Write one row per province and per city found via *index_page*.

    Columns: 0 = sequential id, 1 = division code, 2 = name,
    3 = parent id (0 for a province), 4 = level (1 province, 2 city).
    """
    row = 0      # zero-based worksheet row of the next record
    next_id = 1  # sequential record id; always equals row + 1

    # Attributes given as a dict; equivalent to find_all("tr", class_="provincetr").
    for province_tr in index_page.find_all("tr", attrs={"class": "provincetr"}):
        for cell in province_tr.find_all("td"):
            link = cell.find("a")
            if link is None:
                # Padding cell without a province link.
                continue
            province_href = link.get("href")
            province_name = link.text
            if not province_href:
                continue

            # Province-level row; its code (column 1) is filled in below
            # once a city code of this province is known.
            worksheet.write(row, 0, next_id)
            worksheet.write(row, 2, province_name)
            worksheet.write(row, 3, 0)
            worksheet.write(row, 4, 1)
            province_id = next_id
            province_row = row
            next_id += 1
            row += 1
            print(f"一级url:{province_href}")
            print(f"一级名称:{province_name}")

            province_page = _fetch_soup(_BASE_URL + province_href,
                                        _REQUEST_HEADERS)
            for table in province_page.find_all("table",
                                                attrs={"class": "citytable"}):
                for city_tr in table.find_all("tr", attrs={"class": "citytr"}):
                    links = city_tr.find_all("a")
                    if len(links) < 2:
                        # A city row needs both a code link and a name link.
                        continue
                    city_href = links[0].get("href")
                    city_code = links[0].text
                    city_name = links[1].text
                    if city_href != '' and city_name != '':
                        worksheet.write(row, 0, next_id)
                        worksheet.write(row, 1, city_code)
                        worksheet.write(row, 2, city_name)
                        worksheet.write(row, 3, province_id)
                        worksheet.write(row, 4, 2)
                        next_id += 1
                        row += 1
                        print(f"二级url:{city_href}")
                        print(f"二级编码:{city_code}")
                        print(f"二级名称:{city_name}")
                        # Province code = first two digits of a city code,
                        # followed by nine zeros.
                        worksheet.write(province_row, 1,
                                        city_code[0:2] + "000000000")


def mainClass(output_path=_DEFAULT_OUTPUT_PATH):
    """Scrape province and city division codes into an .xlsx workbook.

    Fetches the index page under ``_BASE_URL``, follows every province link,
    and writes one worksheet row per province and per city (see
    ``_write_divisions`` for the column layout).

    Args:
        output_path: Destination .xlsx file. Defaults to the path that was
            previously hard-coded.

    The workbook is closed even when scraping fails part-way, so the rows
    collected up to that point are still saved.
    """
    index_page = _fetch_soup(_BASE_URL + 'index.html', _REQUEST_HEADERS)

    workbook = xlsxwriter.Workbook(output_path)
    worksheet = workbook.add_worksheet()
    try:
        _write_divisions(worksheet, index_page)
    finally:
        workbook.close()


if __name__ == '__main__':
    mainClass()
免责声明:本站所有文章内容,图片,视频等均是来源于用户投稿和互联网及文摘转载整编而成,不代表本站观点,不承担相关法律责任。其著作权各归其原作者或其出版社所有。如发现本站有涉嫌抄袭侵权/违法违规的内容,侵犯到您的权益,请在线联系站长,一经查实,本站将立刻删除。 本文来自网络,若有侵权,请联系删除,如若转载,请注明出处:https://haidsoft.com/139774.html