不使用代理采集拉勾数据

不使用代理采集拉勾数据

#!/usr/bin/env python3  
# -*- coding: UTF-8 -*-  
"""  
@Project :all_daily_tasks_code @File    :job_another.py  
@Author  :木子  
@Date    :2024/2/22 10:10 """  

import json
import re

from DrissionPage import ChromiumOptions, ChromiumPage

# Browser launch configuration for the scraping session.
options = ChromiumOptions()
# Hide the "save password?" prompt.
options.set_pref(arg='credentials_enable_service', value=False)
# Block all pop-up windows.
options.set_pref('profile.default_content_settings.popups', '0')
# Run the browser with the sandbox disabled.
options.set_argument('--no-sandbox')
# Route traffic through the HTTP proxy.
# NOTE(review): the proxy credentials are hard-coded in this URL — consider
# loading them from the environment instead of committing them.
options.set_proxy('http://1077764809707376640:pMkeLLTz@http-dynamic.xiaoxiangdaili.com:10030')
# Headless mode is currently disabled; to enable it, uncomment:
# options.headless(True)
page = ChromiumPage(options)

# Search keywords and cities to crawl; every (city, keyword) pair is fetched.
key_words = ["爬虫", "数据分析", "python"]
city_list = ["北京", "上海", "武汉", "郑州", "广州", "深圳"]
# Result pages requested for each pair: 1 through 29 (range end is exclusive).
page_numbers = range(1, 30)

# Captures the content of the page's ``__NEXT_DATA__`` JSON script tag.
# DOTALL lets the payload span several lines; MULTILINE only changes how
# ``^``/``$`` behave and has no effect on this pattern.
next_data_re = re.compile(
    r"<script id=\"__NEXT_DATA__\" type=\"application/json\">(.*?)</script>",
    re.DOTALL,
)


def parse_next_data(html: str):
    """Return the decoded ``__NEXT_DATA__`` JSON payload found in *html*.

    Args:
        html: Page source to search.

    Returns:
        The decoded JSON value, or ``None`` when *html* contains no
        ``__NEXT_DATA__`` script tag.

    Raises:
        json.JSONDecodeError: If the tag is present but its content is not
            valid JSON.
    """
    found = next_data_re.search(html)
    if found is None:
        return None
    # Decode with the JSON parser rather than eval(): the payload comes from a
    # remote page, and textually replacing null/true/false before eval() also
    # rewrote those words inside string values.
    return json.loads(found.group(1))


def crawl(browser_page):
    """Fetch every search-result page and save its embedded data as JSON.

    Opens the site's home page, then requests each result page for every
    (city, keyword) combination. When a page carries a ``__NEXT_DATA__``
    payload, it is printed and written to ``{city}_{key}_{i}.json`` in the
    current working directory. Pages without a payload, or with a payload that
    is not valid JSON, are skipped.

    Args:
        browser_page: Object providing ``get(url)`` and an ``html`` attribute,
            such as the ``ChromiumPage`` created earlier in this file.
    """
    # Open the home page before requesting the search-result URLs.
    browser_page.get("https://www.lagou.com")
    for city in city_list:
        for key in key_words:
            for i in page_numbers:
                re_u = f"https://www.lagou.com/wn/jobs?pn={i}&kd={key}&city={city}"
                browser_page.get(re_u)
                try:
                    infp = parse_next_data(browser_page.html)
                except json.JSONDecodeError as err:
                    # One malformed page should not abort the whole crawl.
                    print(f"skipping {re_u}: invalid __NEXT_DATA__ JSON ({err})")
                    continue
                if infp is None:
                    continue
                print(infp)
                with open(f"{city}_{key}_{i}.json", "w", encoding="utf-8") as f:
                    json.dump(infp, f, ensure_ascii=False, indent=4)


if __name__ == "__main__":
    crawl(page)
LICENSED UNDER CC BY-NC-SA 4.0