自动化之跳过人机验证
由于时效问题,该文某些代码、技术可能已经过期,请注意!!!本文最后更新于:10 个月前
playwright
人机验证问题
在header里去掉了 User-Agent 后可以正常访问,加上就会引起网站的人机验证。
未触发人机验证的情况(请求头中不带 User-Agent):
| 1 |  | 
触发人机验证的情况(请求头中带 User-Agent):
| 1 |  | 
感觉这个人机验证也是个玄学啊,也可能跟网站有关系。
- 链接:https://mp.weixin.qq.com/s/LpsxegSY59zOoaPreyEnwQ
- 完整代码:
 import json
 import math
 import os
 import random
 import time
 from urllib.parse import quote

 import pandas as pd
 from playwright.sync_api import sync_playwright, expect
 def human_like_mouse_move(page, start_x, start_y, end_x, end_y):
     """Move the mouse from a start point to an end point along a noisy path.

     Both endpoints are first shifted by a random offset of up to 10 units
     per axis. The pointer then advances in 100-200 small steps: x moves
     linearly, y follows a power curve plus a sine bump that vanishes at
     both ends, and every step gets up to 1 unit of jitter on each axis.
     A short sleep precedes each step. The final call lands exactly on the
     (shifted) end point.

     Args:
         page: Object exposing ``mouse.move(x, y)`` (a Playwright page here).
         start_x: Nominal x coordinate where the movement begins.
         start_y: Nominal y coordinate where the movement begins.
         end_x: Nominal x coordinate where the movement ends.
         end_y: Nominal y coordinate where the movement ends.
     """
     # Randomise both endpoints so repeated runs never share coordinates.
     origin_x = start_x + random.randint(-10, 10)
     origin_y = start_y + random.randint(-10, 10)
     target_x = end_x + random.randint(-10, 10)
     target_y = end_y + random.randint(-10, 10)

     # Randomise the number of steps and the exponent bending the y progress.
     steps = random.randint(100, 200)
     curve_factor = random.uniform(0.1, 0.4)

     for step in range(steps + 1):
         t = step / steps

         x = origin_x + (target_x - origin_x) * t + random.uniform(-1, 1)

         vertical_progress = (target_y - origin_y) * (t ** curve_factor)
         bump = math.sin(t * math.pi) * random.randint(3, 7)
         y = origin_y + vertical_progress + bump + random.uniform(-1, 1)

         # About one step in ten pauses noticeably longer than the rest.
         if random.random() < 0.1:
             pause = random.uniform(0.02, 0.05)
         else:
             pause = random.uniform(0.005, 0.01)
         time.sleep(pause)

         page.mouse.move(x, y)

     # Finish exactly on the target, without the per-step jitter.
     page.mouse.move(target_x, target_y)
 # Disabled helper, kept for reference. It polls the 'aria-valuenow' attribute
 # of a progress element for up to `hold_time` seconds and stops polling once
 # the value reaches 100.
 # def monitor_progress(page, progress_locator, hold_time):
 #     start_time = time.time()
 #     while time.time() - start_time < hold_time:
 #         # Read the current value of the progress bar
 #         progress_value = page.locator(progress_locator).get_attribute('aria-valuenow')

 #         if progress_value:
 #             print(f"Current progress: {progress_value}%")

 #         # Once the progress reaches the threshold, release the mouse early
 #         if progress_value and int(progress_value) >= 100:
 #             print("Progress complete, releasing mouse early")
 #             break
 #         # Wait briefly, then check the progress again
 #         time.sleep(0.1)
 def solve_captcha(page):
     """Attempt the press-and-hold CAPTCHA shown on *page*.

     Waits for ``.px-captcha-container`` and ``#px-captcha`` to be visible,
     moves the mouse from a random viewport position to the centre of the
     button, holds the left button for 15-20 seconds while wiggling the
     pointer slightly, releases it, and then sleeps 30-80 seconds.

     Args:
         page: Playwright page currently showing the CAPTCHA.

     Raises:
         AssertionError: If Playwright's ``expect`` does not see the container
             (within 15 s) or the button become visible.
         RuntimeError: If the button reports no bounding box.
     """
     captcha_container = page.locator('.px-captcha-container')
     expect(captcha_container).to_be_visible(timeout=15000)
     button = page.locator('#px-captcha')
     expect(button).to_be_visible()

     # bounding_box() returns None when the element is not visible; fail with
     # a clear message instead of a TypeError on subscripting None.
     button_box = button.bounding_box()
     if button_box is None:
         raise RuntimeError("CAPTCHA button '#px-captcha' has no bounding box")

     # viewport_size is None when the context runs without a fixed viewport;
     # fall back to Playwright's documented default of 1280x720.
     viewport = page.viewport_size or {'width': 1280, 'height': 720}
     start_x = random.randint(0, viewport['width'])
     start_y = random.randint(0, viewport['height'])
     end_x = button_box['x'] + button_box['width'] / 2
     end_y = button_box['y'] + button_box['height'] / 2

     human_like_mouse_move(page, start_x, start_y, end_x, end_y)
     time.sleep(random.uniform(0.3, 0.6))

     page.mouse.down()
     try:
         hold_time = random.uniform(15, 20)
         # Monotonic clock: the hold length is unaffected by wall-clock changes.
         deadline = time.monotonic() + hold_time
         while time.monotonic() < deadline:
             # Tiny movements around the centre while the button stays pressed.
             page.mouse.move(
                 end_x + random.uniform(-1.5, 1.5),
                 end_y + random.uniform(-1.5, 1.5),
             )
             time.sleep(random.uniform(0.05, 0.15))
         # Optional progress-bar monitoring (currently disabled): pass the
         # locator of the progress element, e.g. 'div.progress-bar'.
         # monitor_progress(page, 'div.progress-bar', hold_time=random.uniform(15, 25))
     finally:
         # Always release the button, even if the hold loop raised.
         page.mouse.up()

     # Long pause after releasing; presumably gives the challenge page time to
     # verify and reload before the caller inspects it - TODO confirm.
     time.sleep(random.uniform(30, 80))
 def parse_url(uname, max_retries=3):
     """Query the niche.com search API for *uname* and save the JSON reply.

     Opens a visible local Chrome through Playwright, loads the search URL
     and, if a ``.px-captcha-container`` element is present, calls
     ``solve_captcha`` up to ``max_retries`` times. A valid JSON response is
     written to ``data/<uname>.json``; anything else is printed.

     Args:
         uname: Search term (a university name in this script). It is
             percent-encoded before being placed in the query string.
         max_retries: Maximum number of navigation / CAPTCHA attempts.

     Returns:
         None. The result is written to disk or reported on stdout.
     """
     # Earlier endpoint, kept for reference:
     # url = f'https://www.niche.com/api/custom-site-search/?query={uname}&page=1&category=all'
     # Percent-encode the term so characters such as '&', '#', '+' or spaces
     # cannot break out of the q= parameter (e.g. "Texas A&M University").
     query = quote(str(uname), safe='')
     url = f'https://www.niche.com/api/sherlock-search/?c=30&q={query}&t=u&s=&a=0'
     with sync_playwright() as p:
         browser = p.chromium.launch(
             channel="chrome",  # use the locally installed Chrome
             headless=False,  # run with a visible window
         )
         try:
             context = browser.new_context()
             page = context.new_page()
             # page.evaluate("() => delete navigator.webdriver")  # would only affect the current document
             # Make navigator.webdriver read as undefined on every page load.
             page.add_init_script("""
 Object.defineProperty(navigator, 'webdriver', {
 get: () => undefined
 });
 """)
             response = None
             for attempt in range(max_retries):
                 response = page.goto(url, wait_until='networkidle')
                 if page.locator('.px-captcha-container').count() == 0:
                     print("No CAPTCHA detected")
                     break
                 print(f"CAPTCHA detected, attempting to solve... (Attempt {attempt + 1})")
                 solve_captcha(page)
                 if not page.locator('.px-captcha-container').is_visible():
                     print("CAPTCHA solved successfully")
                     # The response held so far belongs to the blocked
                     # navigation (the challenge page). Load the URL again so
                     # the API payload is what gets parsed below.
                     response = page.goto(url, wait_until='networkidle')
                     break
                 print("CAPTCHA solution failed, retrying...")
                 time.sleep(random.uniform(5, 10))

             # goto() may return None, and the loop never runs when
             # max_retries is 0; there is nothing to parse in either case.
             if response is None:
                 print("No response received")
                 return

             content = response.text()
             try:
                 json_data = json.loads(content)
             except json.JSONDecodeError:
                 print("Response is not valid JSON")
                 print("Response content:", content)
             else:
                 # print(json.dumps(json_data, indent=2))
                 # Create the output directory on first use.
                 os.makedirs('data', exist_ok=True)
                 with open(f'data/{uname}.json', 'w', encoding='utf-8') as js:
                     json.dump(json_data, js, indent=2)
         finally:
             # Close the browser on every exit path, including errors.
             browser.close()
 # Load the ranking spreadsheet and keep only its first 300 rows.
 df_usnews = pd.read_excel('../usnews大学排名.xlsx').iloc[:300, :]
 for u in df_usnews['name']:
     # Fetch only the names that have no saved result yet, so the script can
     # be re-run after an interruption without repeating finished work.
     if not os.path.exists(f'data/{u}.json'):
         parse_url(u)
本博客所有文章除特别声明外,均采用 CC BY-SA 4.0 协议 ,转载请注明出处!