scraping aspx web with python requests - payload parameters don't match

64 Views Asked by At

I'm trying to scrape this webpage: https://servicios.sbs.gob.pe/ReporteSituacionPrevisional/Afil_Consulta.aspx

enter image description here

The whole process involves 3 requests, but I'm struggling with the first one: enter image description here

Here's my current code:

import requests
from bs4 import BeautifulSoup
from pypasser import reCaptchaV3, reCaptchaV2
from urllib.parse import urlencode, quote

# reCAPTCHA anchor URL; it is passed to pypasser's reCaptchaV3 below.
anchor = 'https://www.google.com/recaptcha/api2/anchor?ar=1&k=6LdR-KUZAAAAANbQ4LKiFpqcLzkNnJ_UanjRRLg0&co=aHR0cHM6Ly9zZXJ2aWNpb3Muc2JzLmdvYi5wZTo0NDM.&hl=en&v=Ya-Cd6PbRI5ktAHEhm9JuKEu&size=invisible&cb=pfvx1zeb6j90'
# Form page: fetched with GET first, then used as the POST target.
link = 'https://servicios.sbs.gob.pe/ReporteSituacionPrevisional/Afil_Consulta.aspx'

# Request headers sent with BOTH the GET and the POST.
# The cookie header is deliberately left disabled so the Session manages cookies.
headers = {
    'authority': 'servicios.sbs.gob.pe',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'accept-language': 'en-US,en;q=0.9',
    'cache-control': 'no-cache',
    'content-type': 'application/x-www-form-urlencoded',
    #'cookie': 'visid_incap_2355492=k5UhEYBsQ+uJtNXuqL8K+YBXlGUAAAAAQUIPAAAAAADYyxuJseMy0g7445Emml5I; __utmz=197714925.1704243225.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); visid_incap_2391105=KSBMKTQSQn+Bq2oK5mWpNHnllGUAAAAAQUIPAAAAAAC813OvPFqUWRpqxkwnFJ4H; visid_incap_2473956=3bKBek8GT0O8Eq7wRLTnLfUJq2UAAAAAQUIPAAAAAAC2IRyfWwKToPwYbok1THxj; visid_incap_2471123=kRDo/tyPQdCF/0ao5jHxM/sJq2UAAAAAQUIPAAAAAAD3uqNTCrxZYMj6ALoDh0DA; _ga_ZSR8R7PS63=GS1.1.1705708022.1.0.1705708028.0.0.0; __utma=197714925.2059640209.1704220546.1705841954.1706019518.12; _gid=GA1.3.85138041.1706042690; _ga_M92VDMFGFM=GS1.3.1706067248.13.0.1706067248.0.0.0; _ga_XK0K3P9NPZ=GS1.1.1706239969.18.0.1706239969.0.0.0; dtCookie=v_4_srv_1_sn_2848AE6D1A3FBE316E41A9CE816B3987_perc_100000_ol_0_mul_1_app-3A195f854993e4a8e0_0; BIGipServerpool-servicios=856974532.20480.0000; TS013dbf92=019955ae1610604d7d692022dde2408ead730348a058a6ba60c521e6c3d0b31ad62a1a495454e49b8d2d51357d69601759ca6420ff; TS01120b42=019955ae1610604d7d692022dde2408ead730348a058a6ba60c521e6c3d0b31ad62a1a495454e49b8d2d51357d69601759ca6420ff; incap_ses_1722_2473956=HOI9BFCbwDQz8FsG7MXlF50WtWUAAAAA94bs5IP/YF27KaPGpJlZvw==; rxVisitor=17063666217950QGJKAIJQ9IJ2U6PPDBK2V33N42B0J8L; _gat_gtag_UA_39437019_4=1; _ga_C1JCYS9G8B=GS1.1.1706366622.15.1.1706366650.0.0.0; _ga=GA1.1.2059640209.1704220546; rxvt=1706368450791|1706366621810; dtPC=1$566649849_545h-vQGKSMNDJCEPLVOOFUMKGTKARICCSBKSU-0e0; dtLatC=1; dtSa=true%7CC%7C-1%7CBuscar%7C-%7C1706366664242%7C566649849_545%7Chttps%3A%2F%2Fservicios.sbs.gob.pe%2FReporteSituacionPrevisional%2FAfil_5FConsulta.aspx%7C%7C%7C%7C',
    'origin': 'https://servicios.sbs.gob.pe',
    'pragma': 'no-cache',
    'referer': 'https://servicios.sbs.gob.pe/ReporteSituacionPrevisional/Afil_Consulta.aspx',
    'sec-ch-ua': '"Not A(Brand";v="99", "Google Chrome";v="121", "Chromium";v="121"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
}

# One Session is shared by the GET and the POST so cookies set by the first
# response are sent back with the second request.
session = requests.Session()

# Step 1: load the form page.
r = session.get(link, headers=headers)
# cookies = requests.utils.dict_from_cookiejar(r.cookies)
# dtcookie = cookies['dtCookie']

# Step 2: call pypasser with the anchor URL; the result is posted below as
# 'g-recaptcha-response'.
recaptcha_response = reCaptchaV3(anchor)

# Step 3: parse the page and assemble the form payload. Insertion order is
# preserved, so the fields are encoded in exactly this order.
soup = BeautifulSoup(r.text, "lxml")

# These three hidden fields are posted empty rather than read from the page.
payload = {
    '__EVENTTARGET': "",  # soup.select_one("#__EVENTTARGET")['value']
    '__EVENTARGUMENT': "",  # soup.select_one("#__EVENTARGUMENT")['value']
    '__LASTFOCUS': "",  # soup.select_one("#__LASTFOCUS")['value']
}

# State fields are read from the freshly fetched page by element id.
for field_id in ('__VIEWSTATE', '__VIEWSTATEGENERATOR', '__EVENTVALIDATION'):
    payload[field_id] = soup.select_one(f"#{field_id}")['value']

# Fixed values captured in Postman, kept for comparison. Uncomment to override
# the values scraped above.
# payload['__VIEWSTATE'] = '/wEPDwUJNjk3OTA2NjI2D2QWAmYPZBYCAgMPZBYCAgEPZBYEAgEPZBYGAgEPZBYEAgEPEA8WAh4HQ2hlY2tlZGhkZGRkAgMPEA8WAh8AZ2RkZGQCAw8PFgIeB1Zpc2libGVoZBYCAgEPEGRkFgFmZAIFDw8WAh8BZ2RkAgUPZBYCAgEPZBYCAkUPEGRkFgFmZBgBBR5fX0NvbnRyb2xzUmVxdWlyZVBvc3RCYWNrS2V5X18WAwUmY3RsMDAkQ29udGVudFBsYWNlSG9sZGVyMSRSYWRpb0J1dHRvbjEFJmN0bDAwJENvbnRlbnRQbGFjZUhvbGRlcjEkUmFkaW9CdXR0b24xBSZjdGwwMCRDb250ZW50UGxhY2VIb2xkZXIxJFJhZGlvQnV0dG9uMlSualpjUz9kmwVDBnA1wSIfTm7JrORIeQyZh9foGx6F'
# payload['__EVENTVALIDATION'] = '/wEdAAg80F4oOKXO8MwWW8n2gmlt7brr3MWKg9kJViOuirjIB9RwSLIqOtitwuZZJ78DEZKjXH07id5Yjcf4KQeV0GN+2eDC2PVjoIjpEKByAHs6fmVPAap64lWmNhT9mjtZzi+nr1li40HLpyp9vhCN0W5b4WP649c5OmkHuTWa9R5wpY3rZ51n/T0lzw7YPH0QF8p9o2VkDx8hDi0dq6D0NCOY'

# Search-form inputs followed by the reCAPTCHA result.
payload.update({
    'ctl00$ContentPlaceHolder1$Busx': 'RadioButton2',
    'ctl00$ContentPlaceHolder1$txtAp_pat': 'NARRO',
    'ctl00$ContentPlaceHolder1$txtAp_mat': 'LEON',
    'ctl00$ContentPlaceHolder1$txtPri_nom': 'PATROCINIO',
    'ctl00$ContentPlaceHolder1$txtSeg_nom': '',
    'ctl00$ContentPlaceHolder1$btnBuscar': '  Buscar  ',
    'g-recaptcha-response': recaptcha_response,
})

# Step 4: percent-encode every key and value by hand (safe='' also escapes
# '/') and send the resulting string as the raw request body.
payload_encoded = "&".join(
    f"{quote(name, safe='')}={quote(value, safe='')}"
    for name, value in payload.items()
)

res = session.post(link, data=payload_encoded, headers=headers)
res

First, I used Postman to capture the payload and headers. In Postman I got a 'suspicious request' response, so the connection itself was made correctly.

Then I built the code above from that, but I get a 404 error in the HTML response. I think this is due to the __VIEWSTATE and __EVENTVALIDATION parameters in the payload: if I comment out the ones coming from soup.select_one and instead use the fixed ones commented out in the code (these were the parameters I tested in Postman), then I get the same response as in Postman. So for some reason the __VIEWSTATE and __EVENTVALIDATION values I get from the first request don't seem to match the ones I see when I inspect the webpage in the browser.

Any idea how to overcome this? I've spent many hours on this so far but can't make it work.

0

There are 0 best solutions below