爬虫基本步骤

1
2
3
4
5
6
7
8
9
10
11
import requests

# Example: fetch the Google homepage through a local proxy.
url = 'https://www.google.com/'

# Browser-like User-Agent so the request is not rejected as an obvious bot.
# (Fixed a typo in the original: "AppleWebKit/537 36" -> "AppleWebKit/537.36".)
headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.58"
}

# Route traffic through the local proxy; adjust the port to your setup.
# NOTE: the target URL is https, so an 'https' entry is required — the
# original dict only defined 'http', which meant the proxy was silently
# skipped for this request.
proxies = {
    'http': 'http://127.0.0.1:10808',
    'https': 'http://127.0.0.1:10808',
}

# timeout prevents the script from hanging forever on a dead proxy.
r = requests.get(url=url, headers=headers, proxies=proxies, timeout=30)
print(r.status_code)  # expect 200 on success
print(r.text)

经验(●’◡’●)

  1. 静态页面一般可以直接通过get请求获得网页数据
  2. 需要js动态渲染的页面获取数据会稍微复杂一点,可以先查看返回的xhr或json文件是否有想要的数据(预览),或者ctrl+F搜索关键词
  3. 通过“发起程序”可以查看文件的请求程序发起链
  4. 爬虫的关键是抓包!找到我们感兴趣的数据!

应用案例1:爬取网站课件

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import requests
import json
import os

# Download lecture-slide (PPT) screenshots from the ZJU classroom API,
# one result page at a time, saving each page's images into its own
# numbered sub-folder under save_dir.
url = "https://classroom.zju.edu.cn/pptnote/v1/schedule/search-ppt?course_id=49219&sub_id=848702&per_page=100"
headers = {
    "User-Agent" : "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.43"
}

page_num = 2                    # number of result pages to fetch
date = '20230615'
save_dir = 'D:/Desktop/ppt_img/' + date + '/'

for i in range(page_num):
    # The API's page parameter is 1-based.
    params = {'page': i + 1}
    r = requests.get(url=url, headers=headers, params=params, timeout=30)
    # Response body is JSON; the slide entries live under the 'list' key.
    text_dict = json.loads(r.text)
    ppt_list = text_dict['list']
    print(len(ppt_list))

    # One sub-folder per page; exist_ok=True replaces the original's
    # separate os.path.exists() check (race-free and shorter).
    page_dir = save_dir + str(i)
    os.makedirs(page_dir, exist_ok=True)

    for j, item in enumerate(ppt_list):
        # Each entry's 'content' field is itself a JSON-encoded string.
        content = json.loads(item['content'])
        img_url = content['pptimgurl']
        r = requests.get(img_url, headers=headers, timeout=30)
        save_path = page_dir + '/' + str(i) + '_' + str(j) + '.jpg'
        # 'wb': write the raw image bytes verbatim to disk.
        with open(save_path, 'wb') as f:
            f.write(r.content)

应用案例2:爬取Allen小鼠脑图谱

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
import requests
import json
import os

# Download rendered section images of an Allen mouse brain atlas:
# fetch the atlas metadata JSON, then request each section image from
# the image service and save it as a numbered .jpg.
url = "https://atlas.brain-map.org/atlasviewer/atlases/602630314/576985993.json"

headers = {
    "User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.58"
}

r = requests.get(url=url, headers=headers, timeout=30)
# Renamed from `dict` (original shadowed the builtin).
atlas = json.loads(r.text)
section_images = atlas['msg'][0]['section_images']

# Image-service URL template: prefix + image path + rendering parameters.
add_1 = "https://atlas.brain-map.org/cgi-bin/imageservice?path="
add_2 = "&mime=1&zoom=3&width=1140&height=800"

save_dir = "D:/Desktop/allen_img/"

for i, img in enumerate(section_images):
    path = img['path']
    # path[:-4] presumably strips a 4-char extension (e.g. ".aff") before
    # appending the "_rendered.aff" suffix — TODO confirm against the API.
    img_url = add_1 + path[:-4] + '_rendered.aff' + add_2
    r = requests.get(img_url, headers=headers, timeout=30)
    save_path = save_dir + str(i + 1) + '.jpg'
    with open(save_path, 'wb') as f:
        f.write(r.content)