分享121个PHP源码,总有一款适合您

分享121个PHP源码,总有一款适合您(PHP 源码)

大家好,欢迎来到IT知识分享网。

PHP源码

 Python采集代码下载链接:采集代码.zip – 蓝奏云

import os


def search_file(dirPath, fileName):
    """Recursively scan *dirPath* and delete every file named *fileName*.

    Each match is printed (its absolute path) before it is removed.

    :param dirPath: root directory to scan
    :param fileName: exact file name to match (case-sensitive)
    :return: None
    """
    dirs = os.listdir(dirPath)  # all files and sub-directories at this level
    for currentFile in dirs:
        # os.path.join instead of hard-coded '/' so the path is correct on
        # every platform (the original concatenated with '/').
        absPath = os.path.join(dirPath, currentFile)
        if os.path.isdir(absPath):
            # Directory: recurse and keep searching below it.
            search_file(absPath, fileName)
        elif currentFile == fileName:
            print(absPath)  # report the hit before deleting it
            os.remove(absPath)

分享121个PHP源码,总有一款适合您

 

    # Class-level scraper configuration and mutable state.
    # (The enclosing class header is outside this chunk.)
    base_url =  "https://down.chinaz.com" # site being scraped
    save_path = "D:\\Freedom\\Sprider\\ChinaZ\\" # local root for downloads (Windows path)
    sprider_count = 121 # total number of resources to collect
    sprider_start_count=7368 # resume offset: was at item 12 of page 491 (of 499 pages) — debug note



    word_content_list = [] # [title, image-path] rows later written into the Word doc by builder_word()
    folder_name = "" # per-category folder name, set by sprider()
    page_end_number=0 # last page number (read from the site's pager in sprider())
    max_pager=15 # items per listing page
    haved_sprider_count =0  # number of resources collected so far
    page_count = 1  # current page counter (original comment referenced the page's gb2312 charset)
    filter_down_file=[] # file names to skip when moving downloads (filled after failures)

    def sprider(self,title_name="NET"):

        """
        Scrape one down.chinaz.com source-code category.

        Maps *title_name* to the site's category id, recreates the temporary
        download directory and the final save directory, drives a headless
        Chrome through every listing page, and delegates per-item collection
        to sprider_detail().

        Category listing URLs for reference:
            PHP    https://down.chinaz.com/class/572_5_1.htm
            NET    https://down.chinaz.com/class/572_4_1.htm
            ASP    https://down.chinaz.com/class/572_3_1.htm
            Python https://down.chinaz.com/class/604_572_1.htm
                   https://down.chinaz.com/class/608_572_1.htm
            WeChat https://down.chinaz.com/class/610_572_1.htm
            Ruby   https://down.chinaz.com/class/622_572_1.htm
            NodeJs https://down.chinaz.com/class/626_572_1.htm
            C      https://down.chinaz.com/class/594_572_1.htm

        :param title_name: category key, e.g. "PHP", "NET", "Python"
        :return: None
        """
        # Resolve the display folder name and the site's category id.
        if title_name == "PHP":
            self.folder_name = "PHP源码"
            self.second_column_name = "572_5"
        elif title_name == "Go":
            self.folder_name = "Go源码"
            self.second_column_name = "606_572"
        elif title_name == "NET":
            self.folder_name = "NET源码"
            self.second_column_name = "572_4"
        elif title_name == "ASP":
            self.folder_name = "ASP源码"
            self.second_column_name = "572_3"
        elif title_name == "Python":
            self.folder_name = "Python源码"
            self.second_column_name = "604_572"
        elif title_name == "JavaScript":
            self.folder_name = "JavaScript源码"
            self.second_column_name = "602_572"
        elif title_name == "Java":
            self.folder_name = "Java源码"
            self.second_column_name = "572_517"
        elif title_name == "HTML":
            self.folder_name = "HTML-CSS源码"
            self.second_column_name = "608_572"
        elif title_name == "TypeScript":
            self.folder_name = "TypeScript源码"
            self.second_column_name = "772_572"
        elif title_name == "微信小程序":
            self.folder_name = "微信小程序源码"
            self.second_column_name = "610_572"
        elif title_name == "Ruby":
            self.folder_name = "Ruby源码"
            self.second_column_name = "622_572"
        elif title_name == "NodeJs":
            self.folder_name = "NodeJs源码"
            self.second_column_name = "626_572"
        elif title_name == "C++":
            self.folder_name = "C++源码"
            self.second_column_name = "596_572"
        elif title_name == "C":
            self.folder_name = "C源码"
            self.second_column_name = "594_572"
        #https://down.chinaz.com/class/594_572_1.htm


        first_column_name = title_name # first-level directory name
        self.sprider_category = title_name  # first-level directory name
        second_folder_name = str(self.sprider_count) + "个" + self.folder_name # second-level directory name
        self.sprider_type =second_folder_name
        self.merchant=int(self.sprider_start_count) //int(self.max_pager)+1 # start page derived from the resume offset, so already-collected pages are skipped
        self.file_path = self.save_path + os.sep + "Code" + os.sep + first_column_name + os.sep + second_folder_name
        # NOTE(review): save_path is re-assigned here, so sprider() can only be
        # called once per instance without compounding the path — confirm intent.
        self.save_path = self.save_path+ os.sep + "Code" + os.sep+first_column_name+os.sep + second_folder_name+ os.sep + self.folder_name
        BaseFrame().debug("开始采集ChinaZCode"+self.folder_name+"...")
        sprider_url = (self.base_url + "/class/{0}_1.htm".format(self.second_column_name))
        down_path="D:\\Freedom\\Sprider\\ChinaZ\\Code\\"+first_column_name+"\\"+second_folder_name+"\\Temp\\"
        # Recreate the temporary download directory from scratch.
        if os.path.exists(down_path) is True:
            shutil.rmtree(down_path)
        if os.path.exists(down_path) is False:
            os.makedirs(down_path)

        # Recreate the final save directory from scratch.
        if os.path.exists(self.save_path ) is True:
            shutil.rmtree(self.save_path )
        if os.path.exists(self.save_path ) is False:
            os.makedirs(self.save_path )
        chrome_options = webdriver.ChromeOptions()
        diy_prefs ={'profile.default_content_settings.popups': 0,
                    'download.default_directory':'{0}'.format(down_path)}
        # Route Chrome's default download directory to down_path.
        chrome_options.add_experimental_option('prefs', diy_prefs)
        chrome_options.add_argument('--headless') # run the browser without a window

        # Instantiate the Chrome driver with the options above.
        driver = webdriver.Chrome(options=chrome_options)
        driver.set_window_size(1280, 800)  # viewport 1280*800

        # Load the first listing page of the chosen category.
        driver.get(sprider_url)
        # content = driver.page_source
        # print(content)
        div_elem = driver.find_element(By.CLASS_NAME, "main")  # listing page: main content area
        element_list = div_elem.find_elements(By.CLASS_NAME, 'item')

        # Read the last page number from the pager widget.
        laster_pager_ul = driver.find_element(By.CLASS_NAME, "el-pager")
        laster_pager_li =laster_pager_ul.find_elements(By.CLASS_NAME, 'number')
        laster_pager_url = laster_pager_li[len(laster_pager_li) - 1]
        page_end_number = int(laster_pager_url.text)
        self.page_count=self.merchant
        while self.page_count <= int(page_end_number):  # stop once every page has been visited
            try:
                if self.page_count == 1:
                    self.sprider_detail(driver,element_list,self.page_count,page_end_number,down_path)
                    pass
                else:
                    if self.haved_sprider_count == self.sprider_count:
                        BaseFrame().debug("采集到达数量采集停止...")
                        BaseFrame().debug("开始写文章...")
                        self.builder_word(self.folder_name, self.word_content_list)
                        BaseFrame().debug("文件编写完毕,请到对应的磁盘查看word文件和下载文件!")
                        break
                    #(self.base_url + "/sort/{0}/{1}/".format(url_index, self.page_count))
                    #http://soft.onlinedown.net/sort/177/2/

                    next_url = self.base_url + "/class/{0}_{1}.htm".format(self.second_column_name, self.page_count)
                    driver.get(next_url)

                    div_elem = driver.find_element(By.CLASS_NAME, "main")  # listing page: main content area
                    element_list = div_elem.find_elements(By.CLASS_NAME, 'item')
                    self.sprider_detail( driver, element_list, self.page_count, page_end_number, down_path)
                    pass
                #print(self.page_count)
                self.page_count = self.page_count + 1  # advance to the next page
            except Exception as e:
                print("sprider()执行过程出现错误:" + str(e))
                sleep(1)
    def sprider_detail(self, driver,element_list,page_count,max_page,down_path):
        """
        Collect every resource on one listing page.

        For each item: record it (skipping URLs already collected), open the
        detail page, click the download button, wait for the archive to land in
        *down_path*, rename it after the resource title and move it into
        self.save_path. Failures are logged and the stray file names are
        remembered in self.filter_down_file so later moves can skip them.

        :param driver: the live Selenium Chrome driver
        :param element_list: listing-page item elements from the caller
        :param page_count: current page number
        :param max_page: last page number
        :param down_path: temporary Chrome download directory
        :return: None
        """
        index = 0
        element_array=[]
        element_length=len(element_list)
        # Snapshot title + detail URL of every item as "title$url" strings, so
        # navigation below cannot invalidate the element references.
        for element in element_list:
            url_A_obj = element.find_element(By.CLASS_NAME, 'name-text')
            next_url = url_A_obj.get_attribute("href")
            coder_title = url_A_obj.get_attribute("title")
            e=coder_title+"$"+ next_url
            element_array.append(e)
            pass
        # On the resume page, skip the items that were already collected.
        if int(self.page_count) == int(self.merchant):
            self.sprider_start_index = int(self.sprider_start_count) % int(self.max_pager)
            index=self.sprider_start_index
        while index < element_length:
            if os.path.exists(down_path) is False:
                os.makedirs(down_path)
            if self.haved_sprider_count == self.sprider_count:
                BaseFrame().debug("采集到达数量采集停止...")
                break
            #element = element_list[index]
            element=element_array[index]
            time.sleep(1)
            index = index + 1
            sprider_info="正在采集第"+str(page_count)+"页的第"+str(index)+"个资源,共"+str(max_page)+"页资源"
            BaseFrame().debug(sprider_info)
            # Unpack the "title$url" pair snapshotted above.
            next_url=element.split("$")[1]
            coder_title=element.split("$")[0]
            # next_url = element.find_element(By.TAG_NAME, 'a').get_attribute("href")
            # coder_title =element.find_element(By.TAG_NAME, 'img').get_attribute("title")
            try:
                codeEntity = SpriderEntity()  # dedup record: already-downloaded resources are not downloaded again
                codeEntity.sprider_base_url = self.base_url
                codeEntity.create_datetime = SpriderTools.get_current_datetime()
                codeEntity.sprider_url = next_url
                codeEntity.sprider_pic_title = coder_title
                codeEntity.sprider_pic_index = str(index)
                codeEntity.sprider_pager_index = page_count
                codeEntity.sprider_type = self.sprider_type
                if SpriderAccess().query_sprider_entity_by_urlandindex(next_url, str(index)) is None:
                    SpriderAccess().save_sprider(codeEntity)
                else:
                    BaseFrame().debug(coder_title+next_url + "数据采集过因此跳过")
                    continue
                driver.get(next_url)  # open the detail page
                # NOTE(review): return value 3 presumably means "element missing" — confirm against SeleniumTools.
                if SeleniumTools.judeg_element_isexist(driver, "CLASS_NAME", "download-item") == 3:
                    driver.back()
                    BaseFrame().debug(coder_title+"不存在源码是soft因此跳过哦....")
                    continue
                print("准备点击下载按钮...")
                driver.find_element(By.CLASS_NAME, "download-item").click()  # trigger the source download
                sleep(1)
                # Wait (up to 240s) for an archive to appear in down_path.
                result,message=SpriderTools.judge_file_exist(True,240,1,down_path,self.filter_down_file,"zip|rar|gz|tgz")
                if result is True:
                    sprider_content = [coder_title, self.save_path + os.sep +"image"+ os.sep + coder_title + ".jpg"]  # record of a successful collection
                    self.word_content_list.append(sprider_content)  # appended to the final Word-document list
                    self.haved_sprider_count = self.haved_sprider_count + 1
                    BaseFrame().debug("已经采集完成第" + str(self.haved_sprider_count) + "个")
                    time.sleep(1)
                    driver.back()
                    coder_title = str(coder_title).replace("::", "").replace("/", "").strip()  # strip characters Windows file names reject
                    files = os.listdir(down_path)
                    file_name = files[0]  # the downloaded archive (down_path holds one file at this point)
                    srcFile = down_path + os.sep + file_name
                    file_ext = os.path.splitext(srcFile)[-1]
                    dstFile = down_path + os.sep + coder_title + file_ext
                    os.rename(srcFile, dstFile)
                    srcFile = dstFile
                    dstFile = self.save_path + os.sep + coder_title + file_ext
                    shutil.move(srcFile, dstFile)  # move the renamed archive to the save directory
                else:
                    files = os.listdir(down_path)  # every file left in the download directory
                    coder_title = str(coder_title).replace("/", "")  # strip characters Windows file names reject
                    try:
                        if str(message)=="0个文件认定是False":
                            BaseFrame().error(coder_title+"文件不存在...")
                            shutil.rmtree(down_path)  # cannot be removed while a download is still in progress
                            pass
                        else:
                            BaseFrame().error("检测下载文件出错可能原因是等待时间不够已经超时,再等待60秒...")
                            time.sleep(60)
                            shutil.rmtree(down_path)  # cannot be removed while a download is still in progress
                        # reset the skip list
                        self.filter_down_file.clear()
                    except Exception as e:
                        # remember the leftover file names so later moves can filter them
                        self.builder_filter_file(files)
                        pass
            except Exception as e:
                BaseFrame().error("sprider_detail()执行过程出现错误:" + str(e))
                BaseFrame().error("sprider_detail()记录下载的文件名")
                # remember the leftover file names so later moves can filter them
                files = os.listdir(down_path)  # every file left in the download directory
                self.builder_filter_file(files)
        if(int(page_count)==int(max_page)):
            self.builder_word(self.folder_name,self.word_content_list)
            BaseFrame().debug("文件编写完毕,请到对应的磁盘查看word文件和下载文件!")

分享121个PHP源码,总有一款适合您

 

最后送大家一首诗:

免责声明:本站所有文章内容,图片,视频等均是来源于用户投稿和互联网及文摘转载整编而成,不代表本站观点,不承担相关法律责任。其著作权各归其原作者或其出版社所有。如发现本站有涉嫌抄袭侵权/违法违规的内容,侵犯到您的权益,请在线联系站长,一经查实,本站将立刻删除。 本文来自网络,若有侵权,请联系删除,如若转载,请注明出处:https://haidsoft.com/101149.html

(0)
上一篇 2026-02-25 12:45
下一篇 2023-09-18 16:15

相关推荐

发表回复

您的邮箱地址不会被公开。 必填项已用 * 标注

关注微信