- 需要爬取的内容在匹配到的列表
- 公司名称:company_name
- 职位名称:job_name
- 薪资:salary
- 工作城市:work_city
- 经验要求:working_exp
- 学历要求:edu_level
- 福利:welfare
- 更新时间:update_date(此字段在接口数据内)
- 技能要求: skill_required (此字段在接口数据内)
- 按F12打开开发者工具
- 点击Network并刷新页面
- 点击开发者工具栏并按Ctrl+F,在出现的搜索框内输入匹配列表的内容,按回车
- 左下为搜索结果,点击搜索结果即可定位结果所在文件
- 点击Preview,所有数据都可以在这里面查找
- 点击headers即可查看请求url,请求方式,headers请求头等数据
- 点击下一页,重复上述步骤,查看请求数据的变化
- 再次点击下一页,重复步骤,查看请求数据变化
- 可以看到请求url变化的是start参数,且参数呈90的倍数变化,假设第一页start为0,测试是否能得到数据,结果为可以
- 请求headers变化的是Referer参数的p参数,第一页为0,第二页为1…
编写代码
import json
from threading import Thread

import execjs
import requests
def get_data(pageCount):
    """Fetch one page (up to 90 items) of '大数据' job listings from the Zhaopin API.

    Args:
        pageCount: Zero-based page index; page N requests start=N*90.

    Returns:
        The list of raw job dicts for this page, or None when the page has
        too few results, signalling the caller to stop paginating.
    """
    # The Referer's `p` query parameter must track the requested page number.
    headers = {
        'Accept': 'application/json, text/plain, */*',
        'Origin': 'https://sou.zhaopin.com',
        'Referer': 'https://sou.zhaopin.com/?p={}&jl=702&sf=0&st=0&kw=%E5%A4%A7%E6%95%B0%E6%8D%AE&kt=3'.format(pageCount),
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
    }
    # x-zp-page-request-id is normally generated client-side by JS as
    # "<uuid>-<timestamp>-<random>" (see the execjs snippet removed from an
    # earlier revision); a previously captured value is accepted by the API.
    # pageCount*90 directly: for page 0 the product is already 0, so the old
    # `if pageCount > 0 else 0` conditional was redundant.
    data_url = ('https://fe-api.zhaopin.com/c/i/sou?start={}&pageSize=90&cityId=702'
                '&salary=0,0&workExperience=-1&education=-1&companyType=-1'
                '&employmentType=-1&jobWelfareTag=-1&kw=大数据&kt=3&=0&_v=0.14145840'
                '&x-zp-page-request-id=2f309a3114434294a5171a3b045f4af7-1557801609159-586728'
                ).format(pageCount * 90)
    # timeout keeps a stalled connection from hanging the whole crawl
    response = requests.get(data_url, headers=headers, timeout=30)
    # response.json() decodes the body directly (same as json.loads(resp.text))
    data = response.json()['data']['results']
    # NOTE(review): a page with 1-2 results is treated as "no data" and ends
    # the crawl — confirm the >2 threshold is intentional, not just non-empty.
    if len(data) > 2:
        return data
    else:
        return None
def save_to_json(data, file_name):
    """Serialize *data* to the file <file_name>.json.

    Args:
        data: Any JSON-serializable object (here, a list of job dicts).
        file_name: Output path without the '.json' suffix.
    """
    file_name = file_name + '.json'
    # ensure_ascii=False keeps Chinese text readable instead of \uXXXX escapes;
    # pin UTF-8 so the output does not depend on the platform locale encoding.
    with open(file_name, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False)
    print(file_name + '保存完成')
def handel_data(data):
    """Pick the fields of interest out of each raw job record.

    Args:
        data: List of raw job dicts as returned by the API.

    Returns:
        List of flat dicts with the nine tracked fields per job.
    """
    extracted = [
        {
            'company_name': item['company']['name'],
            'job_name': item['jobName'],
            'salary': item['salary'],
            'work_city': item['city']['display'],
            'working_exp': item['workingExp']['name'],
            'edu_level': item['eduLevel']['name'],
            'welfare': item['jobTag']['searchTag'],
            'update_date': item['updateDate'],
            'skill_required': item['extractSkillTag'],
        }
        for item in data
    ]
    print(extracted)
    return extracted
def thread_crawl(start, step):
    """Worker for threaded crawling: fetch pages start, start+step, start+2*step, ...

    Stops at the first page that returns no data. Processed records are
    appended to the module-level ``result`` list created in the __main__
    block; with 3 workers and step=3 each worker covers a disjoint set of
    pages, so no record is fetched twice.

    Args:
        start: First page index this worker handles.
        step: Page stride (equal to the number of worker threads).
    """
    page = start
    while True:
        data = get_data(page)
        # `is None`, not `== None` — identity check per PEP 8
        if data is None:
            break
        page += step
        # list.extend is atomic in CPython, so concurrent workers can
        # safely append to the shared `result` without an explicit lock
        result.extend(handel_data(data))
if __name__ == '__main__':
    # --- Sequential crawl: one page at a time until an empty page -----
    result = []
    page = 0
    while True:
        data = get_data(page)
        if data is None:
            break
        page += 1
        result.extend(handel_data(data))
    save_to_json(data=result, file_name='大数据职位')
    # --- Threaded crawl: three workers striding over the pages --------
    # Reset the shared accumulator first: the original code kept extending
    # the sequentially-filled list, so the second save duplicated every record.
    result = []
    # worker k handles pages k, k+3, k+6, ... (disjoint coverage)
    threads = [Thread(target=thread_crawl, args=(offset, 3)) for offset in range(3)]
    for th in threads:
        th.start()
    # join() blocks until each worker finishes — replaces the CPU-burning
    # busy-wait on Thread.isAlive(), which was removed in Python 3.9
    # (renamed is_alive() in Python 3).
    for th in threads:
        th.join()
    save_to_json(data=result, file_name='大数据职位')