1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84
| import requests from lxml import etree import re import time import os from concurrent.futures import ThreadPoolExecutor from tqdm import tqdm
def mkdir(path):
    """Create the folder *path*, including any missing parent folders.

    Nothing happens when *path* already exists.

    :param path: folder path to create
    :return: None
    """
    # Attempt the creation directly instead of testing os.path.exists()
    # first: a separate check leaves a window in which another thread or
    # process can create the folder, making os.makedirs() raise.
    try:
        os.makedirs(path)
    except FileExistsError:
        pass
def data_save(data, timeout=30):
    """Download the photos for one scraped entry and record its details.

    The photo page referenced by ``data[3]`` is fetched, every image matched
    by the XPath ``/html/body/p/img/@src`` is saved into
    ``data/<page name>/``, and the entry's details are appended to
    ``data/<page name>/<page name>.txt``.

    :param data: tuple scraped from the index page, used as
        (scientific name, common name, adult length, photo page href)
    :param timeout: seconds to wait on each HTTP request before giving up
    :return: None
    """
    base_url = "https://sites.pitt.edu/~mcs2/herp/"

    # A timeout keeps one stalled connection from blocking the whole run.
    page_resp = requests.get(base_url + data[3], timeout=timeout)
    page_tree = etree.HTML(page_resp.text)
    image_paths = page_tree.xpath('/html/body/p/img/@src')

    # The folder is named after the photo page file, minus its extension.
    folder_name = data[3].split("/")[-1].split(".")[0]
    folder_path = "data/" + folder_name
    mkdir(folder_path)

    for image_path in image_paths:
        image_url = base_url + image_path
        image_resp = requests.get(image_url, timeout=timeout)
        # Skip failed downloads so an HTTP error body is never stored
        # under an image file name.
        if image_resp.ok:
            photo_name = image_url.split("/")[-1]
            with open(folder_path + '/' + photo_name, mode='wb') as f:
                f.write(image_resp.content)
        # Pause between image requests to avoid hammering the server.
        time.sleep(1)

    # Explicit encoding so the output does not depend on the platform
    # default; append mode keeps whatever was recorded by earlier calls.
    info_path = folder_path + '/' + folder_name + '.txt'
    with open(info_path, mode='a', encoding='utf-8') as ff:
        ff.write("Scientific Name : %s\n" % data[0])
        ff.write("Common Name : %s\n" % data[1])
        ff.write("Adult Length : %s\n" % data[2])
def main():
    """Scrape the Lampropeltis index page and save every matched entry.

    The index page is downloaded, each table row matching the pattern below
    is extracted as a (scientific name, common name, adult length, photo
    href) tuple, and each tuple is handed to ``data_save``.

    :raises requests.HTTPError: if the index page returns an error status
    """
    url = "https://sites.pitt.edu/~mcs2/herp/Lampropeltis.html"
    resp = requests.get(url, timeout=30)
    # Fail loudly on an error status; otherwise the pattern would match
    # nothing in the error page and the run would still report completion.
    resp.raise_for_status()
    page_content = resp.text

    obj = re.compile(
        r'</td>.*?<tr>'
        r'.*?<td nowrap valign = top>(?P<Scientific_Name>.*?)</td>'
        r'.*?<td valign = top>(?P<Common_Name>.*?)</td>'
        r'.*?<td nowrap align = right valign = top>(?P<Adult_Length>.*?)</td>'
        r'.*?<td valign = top><IMG SRC=".*?ball.gif">\s?<A\s+HREF="(?P<Photo>.*?)">Photo</A>',
        re.S
    )
    # findall returns one tuple per row, with groups in pattern order.
    result = obj.findall(page_content)

    for n in tqdm(result, '数据获取中'):
        data_save(n)
    print("数据获取完成")


if __name__ == '__main__':
    main()
|