I need information about professors based on their areas of interest and university, and I think the best source for this is Google Scholar. I tried to scrape Google Scholar profiles without the SERP API: I first used the requests package but got a 429 error that I couldn't solve, so I switched to urllib, which works well. But now I have two problems:
1. I want to scrape the next pages of profiles if there are any, and break the loop otherwise. How do I do that? In my code it neither goes to the next page nor breaks the loop:
2. I noticed that the query only returns exact matches, not substring matches. For example, if I search exactly for "Tropical_Forest_Ecology", it returns a particular researcher — say "John Doe". But when I search for "ecology" at Michigan State University, it does not return "John Doe". How do I make it match a word even when it appears as a substring?
Code:
from bs4 import BeautifulSoup
import urllib
from time import sleep
def Scrape_profiles(label, university_name, country):
    """Scrape Google Scholar author profiles matching a label at given universities.

    Parameters
    ----------
    label : str
        Google Scholar interest label (e.g. "Remote_Sensing").
    university_name : list[str]
        University names; one paginated search is run per name.
    country : str
        Country tag stored alongside every scraped profile.

    Returns
    -------
    dict
        Keys name/email/affiliations/Citation/interests/uni/country, each a
        list with one entry per profile found across all universities.
    """
    # Accumulator lives OUTSIDE the university loop; the original re-created
    # it per university, silently discarding earlier results.
    paper_repos_dict = {
        'name': [],
        'email': [],
        'affiliations': [],
        'Citation': [],
        'interests': [],
        'uni': [],
        'country': [],
    }
    for uni in university_name:
        params = {
            "view_op": "search_authors",           # author results
            "mauthors": f'label:{label} "{uni}"',  # search query
            "hl": "en",                            # language
            "astart": 0,                           # result offset (10 per page)
            # NOTE: dropped "engine": "google_scholar_profiles" — that is a
            # SerpApi-only parameter; scholar.google.com ignores it.
        }
        while True:
            url = "https://scholar.google.com/citations?"
            data = urllib.parse.urlencode(params)
            req = urllib.request.Request(url + data,
                                         headers={'cookie': '<my cookie>'})
            resp = urllib.request.urlopen(req).read()
            soup = BeautifulSoup(resp, 'html5lib')
            for mytag in soup.findAll("div", {"class": "gsc_1usr"}):
                paper_repos_dict['name'].append(
                    mytag.find("h3", {"class": "gs_ai_name"}).text)
                paper_repos_dict['email'].append(
                    mytag.find("div", {"class": "gs_ai_eml"}).text)
                paper_repos_dict['affiliations'].append(
                    mytag.find("div", {"class": "gs_ai_aff"}).text)
                # Text reads "Cited by 1234" — slice off the 8-char prefix.
                paper_repos_dict['Citation'].append(
                    mytag.find("div", {"class": "gs_ai_cby"}).text[8:])
                paper_repos_dict['interests'].append(
                    [item.text for item in
                     mytag.findAll("a", {"class": "gs_ai_one_int"})])
                # Bug fix: record the current university, not the whole list.
                paper_repos_dict['uni'].append(uni)
                paper_repos_dict['country'].append(country)
            sleep(0.2)  # be polite; helps avoid HTTP 429 throttling
            # Pagination: the "next" button embeds a token in its onclick
            # handler. Use .get() so a missing attribute (last page) yields
            # None instead of raising, letting the loop terminate.
            next_button = soup.select_one("button.gs_btnPR")
            onclick = next_button.get("onclick") if next_button else None
            if onclick:
                # onclick contains e.g. ...after_author\x3dXB0HAMS9__8J\x26...
                params["after_author"] = re.search(
                    r"after_author\\x3d(.*)\\x26", str(onclick)).group(1)
                params["astart"] += 10
            else:
                break
    return paper_repos_dict
# Script entry point: example query for Remote Sensing researchers.
if __name__ == "__main__":
    label = "Remote_Sensing"
    university_name = ["Michigan University"]
    country = 'USA'
    # Bug fix: the original passed the hard-coded literal "usa" while the
    # `country` variable defined just above was never used.
    Scrape_profiles(label, university_name, country)

I solved my first question:
the problem was with `["onclick"]` — I used `.get()` instead of `[]` in BeautifulSoup, which returns `None` rather than raising when the attribute is missing.