Getting None when trying to parse the description tag of an RSS feed

26 Views Asked by At

I'm accessing this RSS feed (screenshot: "enter image description here"). As you can see, there is a description tag. However, when I parse the feed, it returns None for the description tag.

This is the error message I get:




AttributeError: 'NoneType' object has no attribute 'text'
Traceback:
File "C:\Users\User\Desktop\news-recommendation\env\lib\site-packages\streamlit\runtime\scriptrunner\script_runner.py", line 535, in _run_script
    exec(code, module.__dict__)
File "C:\Users\User\Desktop\news-recommendation\app.py", line 66, in <module>
    data=parseRSS('https://rss.app/feeds/6BJraU9Ff0IeqC3c.xml')
File "C:\Users\User\Desktop\news-recommendation\parseRSS.py", line 23, in parseRSS
    description_content = BeautifulSoup(item.description.text, "html.parser"

This is the code I'm using:

    resp=requests.get(url)
    soup = BeautifulSoup(resp.content, features="xml")
    soup.prettify()
    items = soup.findAll('item')

    news_items = []
    for item in items:
        news_item={}
        news_item['title']=item.title.text
        news_item['link']=item.link.text
        news_item['pubDate']=item.pubDate.text
        news_items.append(news_item)
     
        description_content = BeautifulSoup(item.description.text, "html.parser")
        # Remove the img tag
        img_tag = description_content.find('img')
        if img_tag:
         img_tag.decompose()
    
         # Assuming you want to keep the rest of the content as HTML
        news_item['description'] = str(description_content)
1

There is 1 best solution below

0
Andrej Kesely On

Some <item> elements don't have a <description> tag, so you need to handle that case:

import requests
from bs4 import BeautifulSoup


def _tag_text(tag):
    """Return the text of a parsed tag, or "" when the tag is missing (None)."""
    return tag.text if tag is not None else ""


def _description_without_image(description_tag):
    """Return the description's HTML as a string, minus its first <img> tag.

    The text of the <description> element is itself re-parsed as HTML so
    that the image can be removed while the remaining markup is kept.
    Returns "" when the item has no <description> element at all.
    """
    if description_tag is None:
        return ""

    content = BeautifulSoup(description_tag.text, "html.parser")
    img_tag = content.find("img")
    if img_tag is not None:
        img_tag.decompose()
    return str(content)


def get_data(url, timeout=10):
    """Download an RSS feed and return its items as a list of dicts.

    Args:
        url: Address of the RSS/XML feed to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``. Without a timeout the request
            could block forever on an unresponsive server.

    Returns:
        A list with one dict per <item> element, each having the keys
        "title", "link", "pubDate" and "description". The description is
        kept as HTML with its first <img> tag removed. Any element that is
        absent from an item is represented by an empty string, so a feed
        with incomplete items no longer raises AttributeError.
    """
    resp = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(resp.content, features="xml")

    news_items = []
    for item in soup.find_all("item"):
        # Build the complete record first, then append it, so the list
        # never holds a half-populated dict.
        news_items.append(
            {
                "title": _tag_text(item.title),
                "link": _tag_text(item.link),
                "pubDate": _tag_text(item.pubDate),
                "description": _description_without_image(item.description),
            }
        )

    return news_items


# Demo: fetch the sample feed and print the parsed items.
feed_url = "https://rss.app/feeds/6BJraU9Ff0IeqC3c.xml"
print(get_data(feed_url))