Published 2020-10-11 · Source: www.pcxitongcheng.com · Author: 电脑系统城
Approach:
1. Convert the list of cities to query into their corresponding code values via the city API (a standalone sketch of this step follows this list).
2. Iterate over the cities and job queries to generate listing URLs.
3. Fetch each listing page from its URL and iterate over the entries it contains.
4. For each entry, follow its job_link to the detail page and store the fields of interest as a dict, data, appended to the list datas.
5. If the listing has a next page, repeat steps 3 and 4, passing datas along the whole way.
6. Once one city/query URL is fully crawled, extend datas_list with datas, then repeat steps 3 to 5 for the next combination.
7. Finally, iterate over datas_list and write each record to Excel.
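Step 1 can be tried on its own. Below is a minimal sketch of the city-name-to-code lookup, reusing the city.json endpoint and the zpData/cityList/subLevelModelList field names from the full script further down; the schema is whatever Boss直聘 served when the article was written, so treat those field names as an assumption.

import requests

def city_codes(names):
    # Public city list; the nested structure mirrors what the full script relies on.
    city_json = requests.get("https://www.zhipin.com/wapi/zpCommon/data/city.json").json()
    codes = []
    for province in city_json["zpData"]["cityList"]:      # provinces
        for city in province["subLevelModelList"]:        # cities within a province
            if city["name"] in names:
                codes.append(city["code"])
    return codes

print(city_codes(["杭州"]))  # expected: [101210100], the value seen in the script's lastCity cookie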
Key points:
1. Reading a response body as JSON, then parsing it and extracting values.
2. Using BeautifulSoup's select(), find_all(), and find() methods.
3. Using Exception to end the pagination loop.
4. Using xlwt to create and edit an Excel workbook (see the sketch after this list).
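For the last point, xlwt only needs three calls: create a workbook, write cells by (row, column), and save. A minimal sketch, with a placeholder sheet name and output path:

import time, xlwt

book = xlwt.Workbook(encoding='utf-8')   # a new .xls workbook
sheet = book.add_sheet("demo")           # one worksheet
sheet.write(0, 0, "编号")                # write(row, column, value)
sheet.write(1, 0, "a-001")
book.save(r'C:\%s_demo.xls' % time.strftime('%Y%m%d%H%M%S'))  # timestamped filename, as the full script does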
import requests, time, xlwt
from bs4 import BeautifulSoup


class MyJob:
    def __init__(self, mycity, myquery):
        self.city = mycity
        self.query = myquery
        self.list_url = "https://www.zhipin.com/job_detail/?query=%s&city=%s&industry=&position=" % (self.query, self.city)
        self.datas = []
        self.header = {
            'authority': 'www.zhipin.com',
            'method': 'GET',
            'scheme': 'https',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'max-age=0',
            'cookie': 'lastCity=101210100;uab_collina=154408714637849548916323;toUrl=/;c=1558272251;g=-;l=l=%2Fwww.zhipin.com%2Fuser%2Flogin.html&r=; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1555852331,1556985726,1558169427,1558272251; __a=40505844.1544087205.1558169426.1558272251.41.14.4.31; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1558272385',
            'referer': 'https://www.zhipin.com/?ka=header-logo',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
        }

    # Convert city names into their code values
    def get_city(self, city_list):
        city_url = "https://www.zhipin.com/wapi/zpCommon/data/city.json"  # city list endpoint
        city_json = requests.get(city_url).json()
        zpData = city_json["zpData"]["cityList"]
        codes = []
        for city in city_list:
            for data_sf in zpData:                            # provinces
                for data_dq in data_sf["subLevelModelList"]:  # cities within a province
                    if city == data_dq["name"]:
                        codes.append(data_dq["code"])
        return codes

    # Crawl every listing page, recursing through pagination
    def get_job_list(self, url, datas):
        print(url)
        html = requests.get(url, headers=self.header).text
        soup = BeautifulSoup(html, 'html.parser')
        jobs = soup.select(".job-primary")
        for job in jobs:
            data = {}
            # job id
            data["job_id"] = job.find_all("div", attrs={"class": "info-primary"})[0].find("a").get("data-jobid")
            # job link
            data["job_link"] = "https://www.zhipin.com" + job.find_all("div", attrs={"class": "info-primary"})[0].find("a").get("href")
            # job title
            data["job_name"] = job.find_all("div", attrs={"class": "info-primary"})[0].find("div", attrs={"class": "job-title"}).get_text()
            # salary
            data["job_red"] = job.find_all("div", attrs={"class": "info-primary"})[0].find("span", attrs={"class": "red"}).get_text()
            # address / years of experience / education
            data["job_address"] = job.find_all("div", attrs={"class": "info-primary"})[0].find("p").get_text().split(" ")
            # company link
            data["job_company_link"] = job.find_all("div", attrs={"class": "info-company"})[0].find("a").get("href")
            # company info
            data["job_company"] = job.find_all("div", attrs={"class": "info-company"})[0].find("p").get_text().split(" ")
            # boss link
            data["job_publis_link"] = job.find_all("div", attrs={"class": "info-publis"})[0].find("img").get("src")
            # boss info
            data["job_publis"] = job.find_all("div", attrs={"class": "info-publis"})[0].find("h3").get_text().split(" ")
            time.sleep(5)
            self.get_job_detail(data)  # fetch the job detail page
            print(data)
            datas.append(data)  # append this job to datas until the current page is exhausted
        try:
            next_url = soup.find("div", attrs={"class": "page"}).find("a", attrs={"class": "next"}).get("href")
            # if next_url[-1] == "3":  # raise after page 2 (handy for testing)
            if next_url == "javascript:;":  # raise on the last page
                raise Exception()
        except Exception as e:
            print("最后一页了;%s" % e)
            return datas  # return the accumulated pages
        else:
            time.sleep(5)
            next_url = "https://www.zhipin.com" + next_url
            self.get_job_list(next_url, datas)
            return datas  # return the accumulated pages

    # Parse the detail page into data
    def get_job_detail(self, data):
        print(data["job_link"])
        html = requests.get(data["job_link"], headers=self.header).text
        soup = BeautifulSoup(html, 'html.parser')
        # hiring company
        data["detail_content_name"] = soup.find_all("div", attrs={"class": "detail-content"})[0].find("div", attrs={"class": "name"}).get_text()
        # benefits
        data["detail_primary_tags"] = soup.find_all("div", attrs={"class": "info-primary"})[0].find("div", attrs={"class": "job-tags"}).get_text().strip()
        # job title
        data["detail_primary_name"] = soup.find_all("div", attrs={"class": "info-primary"})[0].find("h1").get_text()
        # job status
        data["detail_primary_status"] = soup.find_all("div", attrs={"class": "info-primary"})[0].find("div", attrs={"class": "job-status"}).get_text()
        # salary
        data["detail_primary_salary"] = soup.find_all("div", attrs={"class": "info-primary"})[0].find("span", attrs={"class": "salary"}).get_text()
        # address / years of experience / education
        data["detail_primary_address"] = soup.find_all("div", attrs={"class": "info-primary"})[0].find("p").get_text()
        # work address
        data["detail_content_address"] = soup.find_all("div", attrs={"class": "detail-content"})[0].find("div", attrs={"class": "location-address"}).get_text()
        # job description
        data["detail_content_text"] = soup.find_all("div", attrs={"class": "detail-content"})[0].find("div", attrs={"class": "text"}).get_text().strip().replace(";", "\n")
        # boss name
        data["detail_op_name"] = soup.find_all("div", attrs={"class": "detail-op"})[1].find("h2", attrs={"class": "name"}).get_text()
        # boss title
        data["detail_op_job"] = soup.find_all("div", attrs={"class": "detail-op"})[1].find("p", attrs={"class": "gray"}).get_text().split("·")[0]
        # boss activity status
        data["detail_op_status"] = soup.find_all("div", attrs={"class": "detail-op"})[1].find("p", attrs={"class": "gray"}).get_text().split("·")[1]

    # Write the collected data to Excel
    def setExcel(self, datas_list):
        book = xlwt.Workbook(encoding='utf-8')
        table = book.add_sheet("boss软件测试")
        table.write(0, 0, "编号")
        table.write(0, 1, "招聘链接")
        table.write(0, 2, "招聘岗位")
        table.write(0, 3, "薪资")
        table.write(0, 4, "地址")
        table.write(0, 5, "企业链接")
        table.write(0, 6, "企业信息")
        table.write(0, 7, "boss链接")
        table.write(0, 8, "boss信息")
        table.write(0, 9, "detail详情")
        i = 1
        for data in datas_list:
            table.write(i, 0, data["job_id"])
            table.write(i, 1, data["job_link"])
            table.write(i, 2, data["job_name"])
            table.write(i, 3, data["job_red"])
            table.write(i, 4, " ".join(data["job_address"]))  # list fields are joined; xlwt cannot write a list
            table.write(i, 5, data["job_company_link"])
            table.write(i, 6, " ".join(data["job_company"]))
            table.write(i, 7, data["job_publis_link"])
            table.write(i, 8, " ".join(data["job_publis"]))
            # detail fields start at column 10; column 9 carries the "detail详情" marker
            table.write(i, 10, data["detail_content_name"])
            table.write(i, 11, data["detail_primary_name"])
            table.write(i, 12, data["detail_primary_status"])
            table.write(i, 13, data["detail_primary_salary"])
            table.write(i, 14, data["detail_primary_address"])
            table.write(i, 15, data["detail_content_text"])
            table.write(i, 16, data["detail_op_name"])
            table.write(i, 17, data["detail_op_job"])
            table.write(i, 18, data["detail_op_status"])
            table.write(i, 19, data["detail_primary_tags"])
            table.write(i, 20, data["detail_content_address"])
            i += 1
        book.save(r'C:\%s_boss软件测试.xls' % time.strftime('%Y%m%d%H%M%S'))
        print("Excel保存成功")


if __name__ == '__main__':
    city_list = MyJob("", "").get_city(["杭州"])
    query_list = ["软件测试", "测试工程师"]
    datas_list = []
    for city in city_list:
        for query in query_list:
            myjob = MyJob(city, query)
            datas = myjob.get_job_list(myjob.list_url, myjob.datas)
            datas_list.extend(datas)
    myjob.setExcel(datas_list)
That concludes the detailed walkthrough of using Python and bs4 to scrape Boss直聘's static pages.