UPDATE: Thank you, Tuppitappi (https://stackoverflow.com/users/10129894/tuppitappi)! Your answer on the question "Gmail api python encode/decode error", and using urlsafe_b64decode, was EXACTLY what I needed!
All code below has been corrected
The complete code sample below is a simple Python script that pulls down emails, parses them into a simple class list, and then prints out the results. Everything works fine until I get to the base64 decoding part. Many outputs simply do not print at all, and others will convert but then entire elements in the HTML body are still encoded. Has anybody else dealt with this before and, if so, is there a way to properly decode the email body text?
Here is the relevant code pulled out from below (I do grab the encoding from the message part):
# Gmail delivers body data as URL-safe base64. Decode it exactly once, restoring
# any "=" padding that was trimmed, instead of repeating the same failing
# base64 call under every candidate charset.
paddedBody = newOne.body + "=" * (-len(newOne.body) % 4)
try:
    rawBody = base64.urlsafe_b64decode(paddedBody)
except ValueError as base64Error:
    # binascii.Error (bad padding / bad length) is a ValueError subclass.
    # newOne.body is deliberately left untouched, as in the original fallback.
    print("Error with base64 decoding: ", base64Error)
else:
    try:
        newOne.body = rawBody.decode(newOne.charSet, 'backslashreplace')
    except LookupError as defaultError:
        # 'backslashreplace' escapes undecodable bytes instead of raising, so
        # the failure handled here is a charset name Python does not recognise.
        print("Error with default decoding: ", defaultError)
        # UTF-8 is the default
        newOne.body = rawBody.decode('utf-8', 'backslashreplace')
Here is a sample of the error messages:
Error with default decoding: 'utf-8' codec can't decode byte 0xdc in position 184: invalid continuation byte
Error with default decoding: Invalid base64-encoded string: number of data characters (4701) cannot be 1 more than a multiple of 4
Error with ISO decoding: Invalid base64-encoded string: number of data characters (4701) cannot be 1 more than a multiple of 4
Error with ASCII decoding: Invalid base64-encoded string: number of data characters (4701) cannot be 1 more than a multiple of 4
Error with UTF decoding: Invalid base64-encoded string: number of data characters (4701) cannot be 1 more than a multiple of 4
Error with default decoding: Incorrect padding
Error with ISO decoding: Incorrect padding
Error with ASCII decoding: Incorrect padding
Error with UTF decoding: Incorrect padding
Error with default decoding: 'utf-8' codec can't decode byte 0xa0 in position 20: invalid start byte
Here is all the code, now corrected:
import base64
import binascii
import os.path
from datetime import datetime
from typing import List, Optional

import dateutil.parser
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
# OAuth scopes requested from Google (read-only Gmail access).
# After changing this list, delete token.json so the authorization flow
# runs again instead of reusing the stored credentials.
SCOPES = [
    "https://www.googleapis.com/auth/gmail.readonly",
]
class Attachment():
    """Metadata describing one file attached to an email.

    Every field defaults to an empty string so that a freshly created
    instance can be read safely before it has been filled in.
    """

    attachmentId: str  # identifier of the attachment within its message
    mimeType: str      # MIME type of the attached file
    fileName: str      # file name of the attached file

    def __init__(self, attachmentId: str = "", mimeType: str = "",
                 fileName: str = ""):
        """Create an attachment record; all arguments are optional."""
        self.attachmentId = attachmentId
        self.mimeType = mimeType
        self.fileName = fileName
class NewMail():
    """One parsed email: its headers, decoded body and attachment list.

    The class previously declared annotations only, so reading a field that
    had not been assigned (or appending to ``attachments``) raised
    AttributeError. ``__init__`` now gives every field a default.
    """

    mailId: str
    subject: str
    sentTo: str
    sentFrom: str
    dateSent: Optional[datetime]  # None until a date has been assigned
    body: str
    contentType: str
    charSet: str
    # Quoted so the annotation is a forward reference, not evaluated at
    # class-creation time.
    attachments: "List[Attachment]"

    def __init__(self):
        """Initialise every field to an empty value."""
        self.mailId = ""
        self.subject = ""
        self.sentTo = ""
        self.sentFrom = ""
        self.dateSent = None
        self.body = ""
        self.contentType = ""
        self.charSet = "utf_8"
        # A fresh list per instance; a class-level list would be shared.
        self.attachments = []
def _parse_content_type(header_value):
    """Split a Content-Type header value into ``(content_type, charset)``.

    Either element is ``None`` when the header does not carry it. Parameter
    values are returned without surrounding whitespace or quotes, so
    ``text/html; charset="UTF-8"`` yields ``("text/html", "UTF-8")``.
    """
    content_type = None
    charset = None
    for piece in header_value.split(";"):
        piece = piece.strip()
        key, has_value, value = piece.partition("=")
        if has_value:
            # A parameter such as ``charset=...`` or ``boundary=...``. Only the
            # charset is of interest; a boundary may itself contain "/" and
            # must not be mistaken for the media type.
            if key.strip().lower() == "charset":
                charset = value.strip().strip("\"'") or None
        elif "/" in piece and content_type is None:
            content_type = piece
    return content_type, charset


def _decode_body(data, charset):
    """Decode a URL-safe base64 ``body.data`` string into text.

    The base64 step runs once. Missing "=" padding is restored first, because
    an unpadded payload is otherwise rejected with "Incorrect padding".
    Returns ``data`` unchanged (after stripping whitespace) when it is not
    valid base64, and falls back to UTF-8 when ``charset`` is not a codec
    name Python knows.
    """
    data = data.strip()
    padded = data + "=" * (-len(data) % 4)
    try:
        raw = base64.urlsafe_b64decode(padded)
    except ValueError as base64_error:
        # binascii.Error (bad padding / bad length) is a ValueError subclass.
        print("Error with base64 decoding: ", base64_error)
        return data
    try:
        return raw.decode(charset or "utf-8", "backslashreplace")
    except LookupError as charset_error:
        # 'backslashreplace' escapes undecodable bytes instead of raising, so
        # the failure handled here is an unrecognised charset name.
        print("Error with default decoding: ", charset_error)
        return raw.decode("utf-8", "backslashreplace")


def _walk_parts(parts):
    """Yield every message part depth-first, descending into nested parts."""
    for part in parts or []:
        yield part
        yield from _walk_parts(part.get("parts"))


def _build_mail(mail_id, full_message):
    """Turn one ``users.messages.get`` response into a ``NewMail``."""
    mail = NewMail()
    # Assign every field explicitly so later reads never hit a missing
    # attribute, whatever NewMail's constructor does or does not set.
    mail.mailId = mail_id
    mail.subject = ""
    mail.sentTo = ""
    mail.sentFrom = ""
    mail.dateSent = None
    mail.body = ""
    mail.contentType = ""
    mail.charSet = "utf_8"
    mail.attachments = []

    payload = full_message.get("payload") or {}

    # First we process the headers of the main email.
    for header in payload.get("headers") or []:
        name = header["name"].lower()
        value = header["value"]
        if name == "subject":
            mail.subject = value
        elif name == "to":
            mail.sentTo = value
        elif name == "from":
            mail.sentFrom = value
        elif name == "date":
            try:
                mail.dateSent = dateutil.parser.parse(value)
            except (ValueError, OverflowError) as date_error:
                # An unparseable date should not abort the whole listing.
                print("Error parsing Date header: ", date_error)
        elif name == "content-type":
            content_type, charset = _parse_content_type(value)
            if content_type:
                mail.contentType = content_type
            if charset:
                mail.charSet = charset

    # Next we look to see if the top-level body has anything in it.
    encoded = ((payload.get("body") or {}).get("data") or "").strip()

    # Finally we process the parts: attachments carry a filename, and the
    # first part that has data supplies the body if none was found yet.
    for part in _walk_parts(payload.get("parts")):
        if part.get("filename"):
            attachment = Attachment()
            # NOTE(review): partId is kept from the original code; confirm
            # whether body.attachmentId was intended here instead.
            attachment.attachmentId = part.get("partId")
            attachment.mimeType = part.get("mimeType")
            attachment.fileName = part.get("filename")
            mail.attachments.append(attachment)
        elif not encoded:
            part_data = (part.get("body") or {}).get("data")
            if part_data:
                encoded = part_data.strip()
                # The part's own Content-Type describes this body, so it
                # replaces whatever the top-level header said.
                for header in part.get("headers") or []:
                    if header["name"].lower() == "content-type":
                        content_type, charset = _parse_content_type(header["value"])
                        if content_type:
                            mail.contentType = content_type
                        if charset:
                            mail.charSet = charset
                        break

    # Gmail body data is base64 encoded, so we need to decode it.
    if encoded:
        mail.body = _decode_body(encoded, mail.charSet)
    return mail


def main():
    """Show basic usage of the Gmail API.

    Authorizes with the credentials stored in token.json (running the local
    OAuth flow when they are missing or invalid), fetches up to ten messages
    for the authenticated user, parses each into a NewMail and prints the
    recipient, sender, subject and decoded body.
    """
    creds = None
    # The file token.json stores the user's access and refresh tokens, and is
    # created automatically when the authorization flow completes for the first
    # time.
    if os.path.exists("token.json"):
        creds = Credentials.from_authorized_user_file("token.json", SCOPES)
    # If there are no (valid) credentials available, let the user log in.
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                "credentials.json", SCOPES
            )
            creds = flow.run_local_server(port=56560)
        # Save the credentials for the next run
        with open("token.json", "w") as token:
            token.write(creds.to_json())

    # Call the Gmail API
    service = build("gmail", "v1", credentials=creds)
    try:
        results = service.users().messages().list(userId="me", maxResults=10).execute()
        messages = results.get("messages", [])
        if not messages:
            print("No message found.")
            return

        print("Messages:")
        currentMail = []
        for message in messages:
            mail_id = message.get("id")
            full_message = service.users().messages().get(userId="me", id=mail_id).execute()
            currentMail.append(_build_mail(mail_id, full_message))

        # Test print out to verify email content
        for thisMail in currentMail:
            print(f"To: {thisMail.sentTo}\n")
            print(f"From: {thisMail.sentFrom}\n")
            print(f"Subject: {thisMail.subject}\n")
            if thisMail.body != "":
                print(f"Body: {thisMail.body}")
            else:
                print("No email body")
            print("---------------------------------------\n")
    except HttpError as error:
        # TODO(developer) - Handle errors from gmail API.
        print(f"An error occurred: {error}")
    finally:
        service.close()
# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
I have tried every encoding I can think of, but nothing seems to work. I went through https://docs.python.org/2.6/library/codecs.html#standard-encodings and tried different options, but it does not make a difference. I copied the encoding from the email header, but that did not help either. Each time, the error says either that the input string is invalid or that it has incorrect padding.