I am trying to scrape >https://www.britishhorseracing.com/>
Specifically, I want to scrapy the result pages like: https://www.britishhorseracing.com/racing/results/fixture-results/!#/2020/468
The endgoal would be to have a scrapy splash crawler, that crawls the site and scrapes the information from every webpage that matches the "rule"
This site uses java - if I use Splash via http://localhost:8050/ I can render the site normally - but using a scraper, the !# causes problems
If I run the spider in cmd, I get the following output:
2023-10-10 14:51:41 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2023-10-10 14:51:41 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2023-10-10 14:51:41 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.britishhorseracing.com/racing/results/fixture-results/?_escaped_fragment_=%2F2023%2F11284> (referer: None)
2023-10-10 14:51:41 [scrapy.core.engine] INFO: Closing spider (finished)
with no data, because it seems the scraper tries to open the links as stated above, which leads to the wrong webpage (because of the "escaped fragment" part)
I have this code:
import re
import scrapy
from scrapy_splash import SplashRequest
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
class BhacrawlerSpider(CrawlSpider):
name = 'bhacrawler'
allowed_domains = ['www.britishhorseracing.com']
start_urls = ['https://www.britishhorseracing.com/racing/results/fixture-results/#!/2023/11284']
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
script = '''
function main(splash, args)
assert(splash:go(args.url))
assert(splash:wait(15))
return {html=splash:html()}
end
'''
rules = (
Rule(
LinkExtractor(allow=('racing/results/fixture-results/!#', )),
callback='parse_race_details',
follow=True,
),
)
def parse_race_details(self, response):
print(response.text)
title_text = response.xpath('//title/text()').get()
print(title_text)
track = title_text.split('/')[0].strip()
date_match = re.search(r"(\w+) (\d{2}) (\w+) '(\d{2})", title_text)
if date_match:
day = date_match.group(2)
month_name = date_match.group(3)
year_short = date_match.group(4)
# Umwandeln von Monatsname in Monatsnummer
month_num = {
'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04',
'May': '05', 'Jun': '06', 'Jul': '07', 'Aug': '08',
'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12'
}.get(month_name, '01') # Default zu Januar, falls nicht gefunden
formatted_date = f"20{year_short}-{month_num}-{day}" # "2023-10-03"
else:
formatted_date = None
# Extract the part of the Going Stick text
going_stick_text_full = response.xpath(
'//p[@class="meta-entry ng-scope" and contains(.,"Going Stick")]/span[@class="entry-body ng-binding"]/text()'
).get()
going_stick_value = re.search(r"Going Stick (\d+\.\d+)", going_stick_text_full).group(1) if going_stick_text_full else None
print(going_stick_text_full)
race_changes = response.xpath(
'//li[@class="category ng-scope"]/p[@class="meta-entry ng-scope" and @ng-if="race.distanceChange"]/span[@class="entry-body ng-binding"]'
).getall()
print(race_changes)
# Extract time, added yards and final distance
race_data = []
for idx, change in enumerate(race_changes):
time_match = re.search(r"<strong class=\"ng-binding\">(\d+:\d+[apm]*)", change)
time = time_match.group(1) if time_match else None
try:
time = time.strip('pm')
except:
pass
added_yards_match = re.search(r"\+(\d+y)", change)
added_yards = added_yards_match.group(1) if added_yards_match else None
try:
added_yards = added_yards.strip('y')
added_meters = int(added_yards) * 0.9144
except:
added_meters = 0
final_distance_match = re.search(r"to (.+)<", change)
final_distance = final_distance_match.group(1) if final_distance_match else None
#regex suchfilter
regex1 = re.search(r'\d{1,}(?=y)', final_distance)
regex2 = re.search(r'\d{1,}(?=f)', final_distance)
regex3 = re.search(r'\d{1,}(?=m)', final_distance)
if not regex1:
yards = 0
else:
yardstometers = 0.9144
regex1 = regex1.group()
regex1 = int(regex1)
yards = regex1 * yardstometers
if not regex2:
furlong = 0
else:
furlongtometers = 201.168
regex2 = regex2.group()
regex2 = int(regex2)
furlong = regex2 * furlongtometers
if not regex3:
miles = 0
else:
milestometers = 1609.34
regex3 = regex3.group()
regex3 = int(regex3)
miles = regex3 * milestometers
distancefinal = round(yards + miles + furlong, 1)
yield {
'url': response.url,
'track': track,
'date': formatted_date,
'Going Stick': going_stick_value,
'Race time': time,
'added_yards': added_meters,
'final_distance': distancefinal,
}
I assume the !X in the link is causing problems, any ideas how to circumvent the "#!" ? It seems scrapy just runs into an error page because it "escapes" the part of the url
How can I handle "hashbangs" in a url?