zoukankan      html  css  js  c++  java
  • python 爬取获得github项目代码

      1 # # -*- coding:utf-8 -*-
      2 # @Time : 2021/7/22 22:04 
      3 # @Author : 周博
      4 # @File : test_1.py 
      5 # @博客园: https://www.cnblogs.com/smartisn/
      6 import requests
      7 from lxml import etree
      8 import sys
      9 from urllib import request
     10 import zipfile
     11 import os
     12 import time
     13 import Download_mysql_zip.mysql.SQL as MYSQL
     14 from selenium import webdriver
     15 from selenium.webdriver.common.by import By
     16 from selenium.webdriver.support import expected_conditions as EC
     17 from selenium.webdriver.support.wait import WebDriverWait
     18 from selenium.webdriver.chrome.options import Options
     def Get_whole_file(file):
         """Collect the path of every file found beneath ``file``.

         The tree is traversed with ``os.walk``. Only files are reported;
         sub-directories are descended into but not listed themselves.

         :param file: root directory to walk
         :return: list of paths, each built as ``os.path.join(root, name)``,
                  in the order ``os.walk`` yields them
         """
         return [
             os.path.join(folder, name)
             for folder, _subdirs, names in os.walk(file)
             for name in names
         ]
     def un_zip(zip_filename, des_dir):
         '''
         Extract a zip archive into the directory ``des_dir``.

         A corrupt or non-zip file is reported on stdout instead of raising.
         The archive is opened inside the ``try`` because ``zipfile.ZipFile``
         itself raises ``BadZipFile`` when the file is not a valid archive; with
         the open outside the ``try`` that error escaped to the caller.

         :param zip_filename: path of the archive to extract, e.g. a.zip
         :param des_dir: directory to extract into, e.g. ./文件存储/
         :return: None
         '''
         try:
             with zipfile.ZipFile(zip_filename, 'r') as archive:
                 archive.extractall(des_dir)
         except zipfile.BadZipFile:
             # Report the name of the broken archive and carry on.
             print("Error: 压缩文件不完整:",zip_filename)
         else:
             print(zip_filename,"解压成功")
     46 
     def DownLoad_mysql_(start,end):
         """Download a zip archive for every repository URL returned by the database.

         ``MYSQL.select_url_html(start, end)`` supplies the rows; the first
         element of each row is used as the repository page URL. The page is
         fetched, the download link is read from the second entry of the
         "get-repo" dropdown list, and the archive is saved as
         ``./data/<last URL segment>.zip``. Any failure for one URL is printed
         and the loop moves on to the next URL.

         :param start: passed through to ``MYSQL.select_url_html``
                       (the original note "51-60" suggests a row range -- TODO confirm)
         :param end: passed through to ``MYSQL.select_url_html``
         :return: None
         """
         # Common prefix of the dropdown list on the repository page.
         menu_xpath = '//*[@id="repo-content-pjax-container"]/div/div[2]/div[1]/div[1]/span/get-repo/details/div/div/div[1]/ul'
         # urlretrieve does not create missing directories; without this every
         # download failed and the error was only printed by the handler below.
         os.makedirs("./data", exist_ok=True)
         URLS = MYSQL.select_url_html(start,end)
         for url_ in URLS:
             print("*******************")
             url=url_[0]
             print(url)
             file_name = url.split("/")[-1]
             try:
                 strhtml = requests.get(url, timeout=7)  # fetch the page with GET
                 tree = etree.HTML(strhtml.text)
                 # The third entry is only printed for comparison; the second one is downloaded.
                 print(tree.xpath(menu_xpath + '/li[3]/a//@href'))
                 hrefs = tree.xpath(menu_xpath + '/li[2]/a//@href')
                 if not hrefs:
                     # Page layout differs or the page did not load: say so
                     # explicitly instead of failing with an IndexError.
                     print("download link not found:", url)
                     continue
                 href_down = hrefs[0]
                 print("55555555555555555555555555555555555555555")
                 print(href_down)
                 href_down="https://github.com"+href_down
                 print(href_down)
                 target = "./data/" + file_name + '.zip'
                 print(target)
                 request.urlretrieve(href_down, target)
                 print("下载成功")
             except Exception as e:
                 # Best effort: report the failure and continue with the next URL.
                 print(e)
                 continue
     if __name__=="__main__":
         # Script entry point: open the GitHub search results for C# repositories
         # (sorted by stars), then visit each repository in Chrome and click
         # through its "get-repo" dropdown to trigger the browser download.
         # E:\pycharm\WorkPlace\.net_analyzer\DownLoad_GitHub\data\
         options = Options()
         # options.headless = True  # would keep the browser window from opening
         # Raw string: the Windows path contains backslash sequences (\P, \G, \C, \d, \c)
         # that are invalid escapes in a normal string literal.
         driver = webdriver.Chrome(r'D:\Program Apps\Google\Chrome\driver\chromedriver.exe',options=options)
         # One waiter is enough; it only holds the driver and the timeout.
         wait = WebDriverWait(driver, 20)
         # Common prefix of the dropdown widget on a repository page.
         details_xpath = '//*[@id="repo-content-pjax-container"]/div/div[2]/div[1]/div[1]/span/get-repo/details'
         # NOTE(review): the driver is never quit; quitting right after the last
         # click could abort downloads still in progress, so this is left as is.
         # Collect the repository links from each search result page.
         for page in range(0,1):
             url = 'https://github.com/search?l=C%23&o=desc&p='+str(page)+'&q=C%23&s=stars&type=Repositories'
             print("*******************")
             print(url)
             strhtml = requests.get(url, timeout=7)
             tree = etree.HTML(strhtml.text)
             hreff = tree.xpath('//*[@id="js-pjax-container"]/div/div[3]/div/ul//div[@class="f4 text-normal"]//a//@href')
             for hh in hreff:
                 try:
                     file_name=hh.replace("/","_")
                     hh="https://github.com"+hh
                     driver.get(hh)
                     time.sleep(2)
                     # Open the dropdown first, then click its third list entry.
                     button1 = wait.until(EC.element_to_be_clickable((By.XPATH, details_xpath + '/summary')))
                     button1.click()
                     button2 = wait.until(EC.element_to_be_clickable((By.XPATH, details_xpath + '/div/div/div[1]/ul/li[3]/a')))
                     button2.click()
                     # Printed once the click went through; the download itself is handled by the browser.
                     print(hh,"——————下载成功")
                 except Exception as e:
                     # Best effort: report the failure and continue with the next repository.
                     print(e)
                     continue
  • 相关阅读:
    初入angularJS [2]
    初入angularJS [1]
    ubuntu13.10 nginx
    Session对象详解[源于网络]
    二、Python变量
    一、计算机硬件及操作系统
    python进阶之装饰器之3如何利用装饰器强制函数上的类型检查
    python基础之闭包函数
    python进阶之装饰器之2.定义一个可接受参数的装饰器、如何定义一个属性可由用户修改的装饰器、定义一个能接受可选参数的装饰器
    python进阶之装饰器之1.如何定义一个基本的装饰器并使用,保留装饰器的元数据(原信息),逆向解得函数原信息
  • 原文地址:https://www.cnblogs.com/smartisn/p/15802808.html
Copyright © 2011-2022 走看看