Read io.BytesIO with Pandas - Python

411 Views Asked by At

I am totally new in Python and I am following a google drive tutorial to be able to read a file from the cloud:

def download_file(real_file_id):

    try:
        service = authorize()
        file_id = real_file_id

        # pylint: disable=maybe-no-member
        request = service.files().get_media(fileId=file_id)
        file = io.BytesIO()
        downloader = MediaIoBaseDownload(file, request)
        done = False
        while done is False:
            status, done = downloader.next_chunk()
            print(F'Download {int(status.progress() * 100)}.')

    except HttpError as error:
        print(F'An error occurred: {error}')
        file = None

    return file.seek(0)

Then I am trying to read this file with pandas read_csv method:

def read_file(item):
    with open(item, 'rb') as file:
        csvreader = pd.read_csv(file.read())
        print(csvreader)


if __name__ == '__main__':
    file = download_file("file_id")
    read_file(file)

I am sure the download method is working because is tested but when I try to read with pandas it just hang and nothing happens... I need to cancel the script manually and I got this errors:

Traceback (most recent call last):
  File "C:\Users\tluce\PycharmProjects\pythonProject\main.py", line 23, in <module>
    read_file(download_file(files[0]['id']))
  File "C:\Users\tluce\PycharmProjects\pythonProject\main.py", line 15, in read_file
    csvreader = pd.read_csv(file)
  File "C:\Users\tluce\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\io\parsers\readers.py", line 912, in read_csv
    return _read(filepath_or_buffer, kwds)
  File "C:\Users\tluce\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\io\parsers\readers.py", line 577, in _read
    parser = TextFileReader(filepath_or_buffer, **kwds)
  File "C:\Users\tluce\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\io\parsers\readers.py", line 1407, in __init__
    self._engine = self._make_engine(f, self.engine)
  File "C:\Users\tluce\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\io\parsers\readers.py", line 1679, in _make_engine
    return mapping[engine](f, **self.options)
  File "C:\Users\tluce\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\io\parsers\c_parser_wrapper.py", line 93, in __init__
    self._reader = parsers.TextReader(src, **kwds)
  File "pandas\_libs\parsers.pyx", line 548, in pandas._libs.parsers.TextReader.__cinit__
  File "pandas\_libs\parsers.pyx", line 637, in pandas._libs.parsers.TextReader._get_header
  File "pandas\_libs\parsers.pyx", line 848, in pandas._libs.parsers.TextReader._tokenize_rows
  File "pandas\_libs\parsers.pyx", line 859, in pandas._libs.parsers.TextReader._check_tokenize_status
  File "pandas\_libs\parsers.pyx", line 2025, in pandas._libs.parsers.raise_parser_error
pandas.errors.ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.
1

There are 1 best solutions below

0
Naum Raskind On

First: seek() returns the new absolute position of the bytes io as an integer. It does not return the bytes io object. You will need to call file.seek(0) on one line, and then return file on the next. For reference: https://docs.python.org/3/library/io.html#io.IOBase.seek

Second: you can pass the BytesIO object directly into read_csv. Try replace your read_file function with:

def read_file(file_bytesio):
        csvreader = pd.read_csv(file_bytesio)
        print(csvreader)

Here's a short example for you try out yourself!

import io
import pandas as pd

df = pd.DataFrame(data=[
    ["cheeta", 100],
    ["snail", 1]
],
columns=["animal", "speed"])

with io.BytesIO() as csv_buffer:
    df.to_csv(path_or_buf=csv_buffer)
    csv_buffer.seek(0)

    new_df = pd.read_csv(filepath_or_buffer=csv_buffer)
    print(new_df)