"""Download Vault 7 PDF documents listed in a saved WikiLeaks index page.

Reads a locally saved copy of the WikiLeaks "Documents" HTML page, groups
the linked documents by their release-section <h3> headings, creates one
folder per heading under ``down_dir``, and downloads every PDF that is not
already present on disk.
"""
import os
import time

import requests
import wget
from bs4 import BeautifulSoup
from requests_file import FileAdapter
from requests_html import HTMLSession

# Pretend to be a desktop Chrome browser; 'Connection: close' avoids
# holding sockets open across the many small requests.
headers = {'User-Agent': 'User-Agent:Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3)'
                         ' AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
           'Connection': 'close'}

pdf_dir = 'https://wikileaks.org/vault7/document/{0}/{1}.pdf'
base_dir = 'https://wikileaks.org/vault7/document/'
down_dir = "C:\\Users\\51257\\Desktop\\down\\{0}\\"
pdf_test_dir = 'https://wikileaks.org/vault7/document/Protego_Release_' \
               '01_05-Design_Docs-20141009-System_HW_Description/' \
               'Protego_Release_01_05-Design_Docs-20141009-System_HW_Description.pdf'

# First tested whether a single PDF can be downloaded -- it can.
# wget.download(pdf_test_dir, out=down_dir)

# Read the locally saved index page through a file:// adapter so that
# requests_html can parse it like a normal HTTP response.
session = HTMLSession()
session.mount('file://', FileAdapter())
file_url = 'file:///C:/Users/51257/Desktop/WikiLeaks - Documents.html'
r = session.get(file_url)
html_string = r.html.html  # full page markup as a string
session.close()

soup = BeautifulSoup(html_string, 'html.parser')
release_section = soup.find('div', class_='release-section')

# Section headings become folder names: strip the tags and any '/'
# characters, which are illegal in Windows paths.
# FIX: the original mutated h3_list while iterating it via list.index(),
# which silently rewrites the wrong slot when two headings render
# identically; a comprehension has no such hazard.
h3_list = [str(h3).replace('<h3>', '').replace('</h3>', '').replace('/', '')
           for h3 in release_section.find_all('h3')]
print(h3_list)

ul_list = release_section.find_all('ul')

# Each <ul> holds the document links for the heading at the same index;
# zip() pairs them directly.  The original used ul_list.index(ul), which
# is O(n) per lookup and returns the wrong index when two <ul> blocks
# compare equal.
for h3_name, ul in zip(h3_list, ul_list):
    h3_dir = down_dir.format(h3_name)
    # exist_ok makes the script safely re-runnable (resume behavior).
    os.makedirs(h3_dir, exist_ok=True)
    for a in ul.find_all('a'):
        base_href = a['href']
        base_name = base_href.split('/')[-2]  # last path segment = document id
        tar_href = base_href + base_name + '.pdf'
        # FIX: the original guarded downloads with a stale `is_exist`
        # flag captured BEFORE mkdir, so a fresh run created empty
        # folders and downloaded nothing.  Now we always attempt any
        # PDF that is not already on disk.
        if not os.path.exists(h3_dir + base_name + '.pdf'):
            try:
                wget.download(tar_href, h3_dir)
                time.sleep(3)  # throttle: be polite to the server
                print('下载%s文件成功' % (h3_dir + base_name + '.pdf'))
            except Exception as err:
                # Best effort: log the failed URL and continue with the rest.
                print(err)
                print(tar_href)
        else:
            print('存在%s文件' % base_name)