How can I save Twitter data in a JSON file?

744 Views Asked by At

I am scraping data from Twitter using Twython, and this part works successfully. However, for further data manipulation, I need to save the Twitter data to JSON or any other format that can be opened with pandas.

I want to include every single column from the scraping result, including language, location, retweets, and so on. I know how to do this for a few columns, but I could not find information about how to include all of them.

import json

# Twitter API keys (replace the placeholders with real values).
credentials = {
    'CONSUMER_KEY': '...',
    'CONSUMER_SECRET': '...',
    'ACCESS_TOKEN': '...',
    'ACCESS_SECRET': '...',
}

# Persist the credentials so later scripts can load them from disk
# instead of hard-coding the keys.
with open("twitter_credentials.json", "w") as fh:
    json.dump(credentials, fh)

# Import the Twython class
from twython import Twython
import json

# Load credentials from json file
with open("twitter_credentials.json", "r") as file:
    creds = json.load(file)

# Instantiate an object
python_tweets = Twython(creds['CONSUMER_KEY'], creds['CONSUMER_SECRET'])

python_tweets.search(q='#python', result_type='popular',count=5)

OUTPUT:
{'statuses': [{'created_at': 'Mon Dec 14 04:05:03 +0000 2020',
   'id': 1338334158205169664,
   'id_str': '1338334158205169664',
   'text': '  Hmmm...this looks right, doesn’t it? We’ll give you a hint - the result is meant to be 36!\n\nCan you find the err… ',
   'truncated': True,
   'entities': {'hashtags': [],
    'symbols': [],
    'user_mentions': [],
    'urls': [{'url': '',
      'expanded_url': '',
      'display_url': 'twitter.com/i/web/status/1…',
      'indices': [117, 140]}]},
   'metadata': {'result_type': 'popular', 'iso_language_code': 'en'},
   'source': '<a href=">',
   'in_reply_to_status_id': None,
   'in_reply_to_status_id_str': None,
   'in_reply_to_user_id': None,
   'in_reply_to_user_id_str': None,
   
and so on

My question is: how can I save the data I got from Twitter in JSON format so that I can open it later with pandas? I basically just want to open it with pandas somehow.

I have tried the following codes:

data= {}
# BUG (the error reproduced below): this line uses the dict returned by
# search() as a *key* into `data` - it is a bare subscription, not an
# assignment.  Dicts are unhashable, hence
# "TypeError: unhashable type: 'dict'".
data[python_tweets.search(q='#python', result_type='popular',count=5)]
with open("twitter_new.json", "w") as file:
    json.dump(data, file)

TypeError: unhashable type: 'dict'


data=python_tweets.search(q='#python', result_type='popular',count=5)
# Fails because `data` mixes a list of dicts ('statuses') with a plain
# dict ('search_metadata'); pass data['statuses'] to DataFrame instead.
df = pd.DataFrame(data)

ValueError: Mixing dicts with non-Series may lead to ambiguous ordering.
1

There are 1 best solutions below

5
furas On

To save the results from search(), simply assign them to a variable (data = ...) and save that variable:

# `client` is a Twython instance; search() returns one JSON-serializable dict.
data = client.search(q='#python', result_type='popular', count=5)

# The whole response can be dumped to a file as-is.
with open('tweets_python.json', 'w') as fh:
    json.dump(data, fh)

But this JSON has a complex structure - it has two different sub-dictionaries, data['statuses'] and data['search_metadata'], which cannot be converted together into one DataFrame. Probably you need only the values from data['statuses'] (even without saving to a file):

 # Only the tweet list is tabular; each status dict becomes one row.
 df = pd.DataFrame(data['statuses'])
 print(df)

Result:

                       created_at                   id               id_str  ... retweeted  possibly_sensitive lang
0  Sun Dec 20 15:14:21 +0000 2020  1340676922230136833  1340676922230136833  ...     False               False   en
1  Sun Dec 20 04:12:58 +0000 2020  1340510479861616643  1340510479861616643  ...     False               False   en
2  Sun Dec 20 15:06:34 +0000 2020  1340674963452391426  1340674963452391426  ...     False               False   en
3  Mon Dec 14 04:05:03 +0000 2020  1338334158205169664  1338334158205169664  ...     False               False   en
4  Mon Dec 14 21:38:14 +0000 2020  1338599202125803521  1338599202125803521  ...     False               False   en

Minimal working code which I used to test it

"""Search Twitter for #python tweets, save the raw JSON response to a
file, and load the tweet list into a pandas DataFrame."""

from twython import Twython
import json
import pandas as pd

# --- credentials ---

# The keys were previously written to this file by the credential-saving
# script at the top of the question.
with open("twitter_credentials.json", "r") as file:
    creds = json.load(file)
CONSUMER_KEY    = creds['CONSUMER_KEY']
CONSUMER_SECRET = creds['CONSUMER_SECRET']

# Alternative: read the keys from environment variables instead of a file.
#import os
#CONSUMER_KEY    = os.getenv('TWITTER_KEY')
#CONSUMER_SECRET = os.getenv('TWITTER_SECRET')

# --- main ---

client = Twython(CONSUMER_KEY, CONSUMER_SECRET)

# search() returns a single dict with two top-level keys (see below).
data = client.search(q='#python', result_type='popular', count=5)

#print(data.keys())  # 'statuses', 'search_metadata'

# save in JSON

# The full response is already JSON-serializable, so dump it unchanged.
with open('tweets_python.json', 'w') as fh:
    json.dump(data, fh)

# use directly with pandas

# Only the list under 'statuses' is tabular (one dict per tweet);
# passing the whole response would mix it with 'search_metadata' and
# raise the ValueError shown in the question.
df = pd.DataFrame(data['statuses'])

print(df)

BTW:

Your dictionary data = {} could be useful if you would like to keep many results

data = {}

# NOTE: `...` is a placeholder for the remaining search() arguments
# (result_type, count, etc.) - this sketch is not runnable as-is; see
# the full working code below.
data['python'] = client.search(q='#python', ...)
data['php']    = client.search(q='#php', ...)
data['java']   = client.search(q='#java', ...)

and save them in separate JSON files

# Write each search response to its own JSON file, keyed by hashtag,
# e.g. tweets_python.json, tweets_php.json, ...
for topic, result in data.items():
    out_name = 'tweets_' + topic + '.json'
    with open(out_name, 'w') as out_file:
        json.dump(result, out_file)

or open them in separate DataFrames

# Build one DataFrame per hashtag from the 'statuses' list of each response.
all_dfs = {tag: pd.DataFrame(result['statuses'])
           for tag, result in data.items()}

# Show every DataFrame, labelled by its hashtag.
for tag, frame in all_dfs.items():
    print('dataframe for:', tag)
    print(frame)

Minimal working code which I used to test it

"""Run several hashtag searches, save each response to its own JSON
file, and build one pandas DataFrame per hashtag."""

from twython import Twython
import json
import pandas as pd

# --- credentials ---

# The keys were previously written to this file by the credential-saving
# script at the top of the question.
with open("twitter_credentials.json", "r") as file:
    creds = json.load(file)
CONSUMER_KEY    = creds['CONSUMER_KEY']
CONSUMER_SECRET = creds['CONSUMER_SECRET']

# Alternative: read the keys from environment variables instead of a file.
#import os
#CONSUMER_KEY    = os.getenv('TWITTER_KEY')
#CONSUMER_SECRET = os.getenv('TWITTER_SECRET')

# --- main ---

client = Twython(CONSUMER_KEY, CONSUMER_SECRET)

# One search() response per hashtag, keyed by a short name.
data = {}

data['python'] = client.search(q='#python', result_type='popular', count=5)
data['php']    = client.search(q='#php', result_type='popular', count=5)
data['java']   = client.search(q='#java', result_type='popular', count=5)

# save in JSON

# Each response goes to its own file, e.g. tweets_python.json.
for key, value in data.items():
    filename = f'tweets_{key}.json'
    print('saving', filename)

    with open(filename, 'w') as fh:
        json.dump(value, fh)

# use directly with pandas

all_dfs = {}

# Only the 'statuses' list is tabular; build one DataFrame per hashtag.
for key, value in data.items():
    all_dfs[key] = pd.DataFrame(value['statuses'])

for key, df in all_dfs.items():
    print('dataframe for:', key)
    print(df)