Python爬取王蛇属蛇类科普网站数据

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import requests
from lxml import etree
import re
import time
import os
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm


def mkdir(path):
    """
    Create the directory at *path* (including parents) if it does not exist.

    :param path: directory path to create
    :return: None
    """
    # exist_ok avoids the check-then-create race of the original
    # os.path.exists() + os.makedirs() sequence.
    os.makedirs(path, exist_ok=True)


def data_save(data):
    """
    Download all photos for one snake entry and record its metadata.

    :param data: tuple scraped from the index page:
                 (scientific name, common name, adult length, photo-page href)
    :return: None
    """
    photo_url_all = "https://sites.pitt.edu/~mcs2/herp/" + data[3]
    # Fetch the photo page that lists the individual image files.
    # timeout keeps a stalled connection from hanging the scraper forever.
    resp_photo_page = requests.get(photo_url_all, timeout=30)
    photo_page_content_html = etree.HTML(resp_photo_page.text)
    # Relative paths of every image embedded in the photo page.
    img_paths = photo_page_content_html.xpath('/html/body/p/img/@src')
    # Folder name = photo-page filename without its extension; one folder per entry.
    folder_name = (data[3].split("/")[-1]).split(".")[0]
    mkdir("data/" + folder_name)
    for img_path in img_paths:
        # Save each image under its original filename.
        # NOTE: the original rebound the loop source variable (`src`) here;
        # renamed to avoid that shadowing.
        img_url = "https://sites.pitt.edu/~mcs2/herp/" + img_path
        img_resp = requests.get(img_url, timeout=30)
        photo_name = img_url.split("/")[-1]
        with open("data/" + folder_name + '/' + photo_name, mode='wb') as f:
            f.write(img_resp.content)
        time.sleep(1)  # be polite to the server between downloads
    # Append the entry's metadata next to its images.
    with open("data/" + folder_name + '/' + folder_name + '.txt', mode='a') as ff:
        ff.write("Scientific Name : %s\n" % data[0])
        ff.write("Common Name : %s\n" % data[1])
        ff.write("Adult Length : %s\n" % data[2])


if __name__ == '__main__':
    # Index page listing every Lampropeltis (kingsnake) species in one table.
    url = "https://sites.pitt.edu/~mcs2/herp/Lampropeltis.html"
    resp = requests.get(url, timeout=30)
    page_content = resp.text
    # One match per table row: scientific name, common name,
    # adult length, and the href of the species' photo page.
    obj = re.compile(r'</td>.*?<tr>'
                     r'.*?<td nowrap valign = top>(?P<Scientific_Name>.*?)</td>'
                     r'.*?<td valign = top>(?P<Common_Name>.*?)</td>'
                     r'.*?<td nowrap align = right valign = top>(?P<Adult_Length>.*?)</td>'
                     r'.*?<td valign = top><IMG SRC=".*?ball.gif">\s?<A\s+HREF="(?P<Photo>.*?)">Photo</A>', re.S
                     )
    # findall returns a list of (scientific, common, length, href) tuples.
    result = obj.findall(page_content)

    # Sequential download with a progress bar. For parallel fetching,
    # submit data_save to a ThreadPoolExecutor instead (import is in place).
    for n in tqdm(result, '数据获取中'):
        data_save(n)
    print("数据获取完成")