In the code below I am scraping Google search result links with the help of Newspaper3k. However, the code fails whenever it comes across a link that cannot be scraped. How can I skip the websites that cannot be scraped and still collect the results for the links that can be, using the same code?

import pandas as pd
import time
!pip3 install newspaper3k
from googlesearch import search
import nltk
from newspaper import Article

newslist=[]
query=input("enter your query")
try:
    for i in search(query, tld="com",num=70, stop=70, pause=2,lang='en'):
     print(i)
     newslist.append(i)
    

    list_dataframe = pd.DataFrame(newslist)
    list_dataframe.reset_index(drop=True)
    df=list_dataframe
    df.rename(columns={ df.columns[0]: "Links" }, inplace = True)

    df=df.reset_index(drop=True)
    
    len=df.shape[0]
    date=[]
    image=[]
    Text=[]
    Summary=[]
    Keywords=[]
    url_links=[]

    i=0 
    nltk.download('punkt')  
    try:
    
     for i in range(0,(len)):
          # print(i)
          print(i)
          url=df['Links'][i]
          print(url)
          url_links.append(url)
          article=Article(url)
          article.download()
          article.parse()
          article.nlp()
          imag=article.top_image
          image.append(imag)
          Texxt=article.text
          Text.append(Texxt)
          Sumary=article.summary
          Summary.append(Sumary)
          Kewords=article.keywords
          Keywords.append(Kewords)
          i += 1
        
      
    except:
        print("error")


    data={'Links':url_links,'image':image,'Text':Text,'Summary':Summary, 'Keywords':Keywords}
    df1=pd.DataFrame(data)
    df1
    df1.to_csv('Table.csv',index = False)

except:
    print("error")

I can manually add a link-removal line for the offending website (as shown below) each time I encounter an error, but repeating this by hand is cumbersome. Please help me find a way to continue the loop whenever a link that cannot be scraped comes up, so that the results for the remaining links are produced as in the code above.

df= df[~df.Links.str.contains('forbes')]
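
If several websites turn out to be problematic, the same manual filter can be written once for all of them; a rough sketch (the domain names here are only placeholders) would be:

# Hypothetical list of domains that are known to fail; adjust as needed.
bad_domains = ['forbes', 'bloomberg']
df = df[~df.Links.str.contains('|'.join(bad_domains))].reset_index(drop=True)

But I would still have to add a new domain by hand every time another unscrapeable site appears.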

There is 1 answer below.

Answer by Utkarsh Singh:

I found a way to skip any webpage that cannot be scraped, so the results continue for the rest of the links.

import pandas as pd
import time
!pip3 install newspaper3k
from googlesearch import search
import nltk
from newspaper import Article
newslist=[]
query=input("enter your query")
try:
    for i in search(query, tld="com",num=100, stop=100, pause=2,lang='en'):
     print(i)
     newslist.append(i)
    

    list_dataframe = pd.DataFrame(newslist)
    list_dataframe.reset_index(drop=True)
    df=list_dataframe
    df.rename(columns={ df.columns[0]: "Links" }, inplace = True)

    

    df=df.reset_index(drop=True)
    
    len=df.shape[0]
    len
    date=[]
    image=[]
    Text=[]
    Summary=[]
    Keywords=[]
    url_links=[]

    i=0 
    nltk.download('punkt')  
    try:
    
      for i in range(0,(len)):
          # print(i)
          print(i)
          url=df['Links'][i]
          print(url)
          try:
                  # All of the per-link work happens inside this try block, so a
                  # link that cannot be downloaded or parsed is simply skipped
                  # and the loop moves straight on to the next one.
                  article=Article(url)
                  article.download()
                  article.parse()
                  article.nlp()
          except:
                  print("This link cannot be scraped. Trying the next one.")
                  continue

          url_links.append(url)
          imag=article.top_image
          image.append(imag)
          Texxt=article.text
          Text.append(Texxt)
          Sumary=article.summary
          Summary.append(Sumary)
          Kewords=article.keywords
          Keywords.append(Kewords)
    except:
        print("error1")


    data={'Links':url_links,'image':image,'Text':Text,'Summary':Summary, 'Keywords':Keywords}
    df1=pd.DataFrame(data)
    df1
    df1.to_csv('Table.csv',index = False)

except:
    print("error2")

This works for any query and any number of result links.
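
For reference, the skipping logic boils down to the per-link try/except; a minimal standalone sketch of that pattern, using a made-up list of links in place of the search results, looks like this:

import nltk
from newspaper import Article

nltk.download('punkt')   # required by article.nlp()

# Made-up example links; in the code above this list comes from the Google search results.
links = ["https://example.com/good-article", "https://example.com/blocked-page"]

scraped = []
for url in links:
    try:
        article = Article(url)
        article.download()
        article.parse()
        article.nlp()
    except Exception:
        # Anything that fails to download, parse or summarise is skipped.
        print("Skipping link that cannot be scraped:", url)
        continue
    scraped.append({'Links': url, 'image': article.top_image, 'Text': article.text,
                    'Summary': article.summary, 'Keywords': article.keywords})

Building pd.DataFrame(scraped) from that list gives the same columns as the Table.csv generated above.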