I want to show 2 graphs one for the Top 10% income share held and for the bottom 10%, but i am running into the isue that only one grah is illustrated.
A sample of data for the poorest is: "#""Germany"",""DEU"",""Income share held by lowest 10%"",""SI.DST.FRST.10"","""","""","""","""","""","""","""","""","""","""","""","""","""","""","""","""","""","""","""","""","""","""","""","""","""","""","""","""","""","""","""",""3.7"",""3.7"",""3.7"",""3.4"",""3.4"",""3.7"",""3.6"",""3.6"",""3.5"",""3.5"",""3.5"",""3.4"",""3.4"",""3.4"",""3.3"",""3.3"",""3.4"",""3.4"",""3.3"",""3.4"",""3.4"",""3.2"",""3.3"",""3.2"",""3.1"",""3.1"",""2.8"",""3.1"",""3.1"","""","""","""","
A sample of data for the wealthiest is: "Germany,""DEU"",""Income share held by highest 10%"",""SI.DST.10TH.10"","""","""","""","""","""","""","""","""","""","""","""","""","""","""","""","""","""","""","""","""","""","""","""","""","""","""","""","""","""","""","""",""23.2"",""23.1"",""22.8"",""22.9"",""22.7"",""22.3"",""22.4"",""22.3"",""23.1"",""22.9"",""23.9"",""23.7"",""23.9"",""24"",""25.1"",""24.7"",""25.1"",""24.7"",""24"",""24"",""24.5"",""24.4"",""25"",""24.1"",""24.8"",""24.6"",""24.8"",""25.2"",""25.2"","""","""","""","
My updated Code is:
import csv
import matplotlib.pyplot as plt
def read_income_shares(file_name, wealthiest_file_name):
income_shares = {}
countries = []
years = []
# Read data from the first file
try:
with open(file_name, 'r', encoding='utf-8') as file:
reader = csv.reader(file, quoting=csv.QUOTE_NONE)
# Skip the first 4 rows
for _ in range(4):
next(reader)
# Read the header to get the years
header = next(reader)
years = [int(year.strip('"')) for year in header[4:] if year.strip('"').isdigit()]
for i, line in enumerate(reader, start=5):
try:
country_name = line[0].strip('"')
values = []
for val in line[4:]:
val = val.replace('""""', '0').replace('"', '').strip()
if val and val.replace('.', '').isdigit():
values.append(float(val))
else:
values.append(0)
income_shares.setdefault(country_name, {}).update({'Values_Poorest': values})
if country_name not in countries:
countries.append(country_name)
except Exception as e:
print(f"Error in line {i}: {e}")
print(f"Line content: {line}")
print(f"Warning: Unexpected data structure for {country_name}")
except FileNotFoundError:
print(f"Error: File '{file_name}' not found.")
except Exception as e:
print(f"Error: An unexpected error occurred: {e}")
# Read data from the second file
try:
with open(wealthiest_file_name, 'r', encoding='utf-8') as file:
reader = csv.reader(file, quoting=csv.QUOTE_NONE)
# Skip the first 4 rows
for _ in range(4):
next(reader)
for i, line in enumerate(reader, start=5):
try:
country_name = line[0].strip('"')
values_wealthiest = []
values_poorest = []
for val in line[4:]:
val = val.replace('""""', '0').replace('"', '').strip()
if val and val.replace('.', '').isdigit():
values_poorest.append(float(val))
else:
values_poorest.append(0)
if val and val.replace('.', '').isdigit():
values_wealthiest.append(float(val))
else:
values_wealthiest.append(0)
if country_name in income_shares:
income_shares[country_name]['Values_Poorest'] = values_poorest
else:
income_shares.setdefault(country_name, {}).update({'Values_Poorest': values_poorest})
if country_name not in countries:
countries.append(country_name)
if country_name in income_shares:
income_shares[country_name]['Values_Wealthiest'] = values_wealthiest
else:
income_shares.setdefault(country_name, {}).update({'Values_Wealthiest': values_wealthiest})
if country_name not in countries:
countries.append(country_name)
except Exception as e:
print(f"Error in line {i}: {e}")
print(f"Line content: {line}")
print(f"Warning: Unexpected data structure for {country_name}")
except FileNotFoundError:
print(f"Error: File '{wealthiest_file_name}' not found.")
except Exception as e:
print(f"Error: An unexpected error occurred: {e}")
return income_shares, countries, years
def plot_income_distribution(countries):
income_data_one, _, _ = read_income_shares('C:\\Users\\Fabian\\Desktop\\Python Ausarbeitung\\Bravo\\one.txt', 'C:\\Users\\Fabian\\Desktop\\Python Ausarbeitung\\Bravo\\two.txt')
income_data_two, _, _ = read_income_shares('C:\\Users\\Fabian\\Desktop\\Python Ausarbeitung\\Bravo\\two.txt', 'C:\\Users\\Fabian\\Desktop\\Python Ausarbeitung\\Bravo\\two.txt')
formatted_countries = [] # Collect formatted country names
for country in countries:
# Cleaning up the country name
country_formatted = country.strip('" \ufeff')
formatted_countries.append(country_formatted) # Collect formatted country names
# Check if data for the country is available in both files
if country_formatted in income_data_one and country_formatted in income_data_two:
# Add debugging print statements
print("Keys for {} in one.txt: {}".format(country_formatted, income_data_one[country_formatted].keys()))
print("Keys for {} in two.txt: {}".format(country_formatted, income_data_two[country_formatted].keys()))
# Process the data for one.txt
if 'Values_Poorest' in income_data_one[country_formatted]:
income_data_poorest_one = income_data_one[country_formatted]['Values_Poorest']
# Choose only Years from 1960 to 2022
years_to_plot = list(range(1960, 2023))
# Convert values to percentage
income_data_poorest_percent_one = [val for val in income_data_poorest_one]
# Filter out values equal to 0
non_zero_years_poorest_one = [year for year, val in zip(years_to_plot, income_data_poorest_percent_one) if val > 0]
non_zero_percentages_poorest_one = [val for val in income_data_poorest_percent_one if val > 0]
# Print the data for debugging
print("Years for {} in one.txt: {}".format(country_formatted, non_zero_years_poorest_one))
print("Poorest Percentages for {} in one.txt: {}".format(country_formatted, non_zero_percentages_poorest_one))
# Plot only if data is available for the year
plt.plot(non_zero_years_poorest_one, non_zero_percentages_poorest_one, label='{} - Poorest 10%'.format(country_formatted), linestyle='dashed')
# Process the data for two.txt
if 'Values_Wealthiest' in income_data_two[country_formatted]:
income_data_wealthiest_two = income_data_two[country_formatted]['Values_Wealthiest']
# Choose only Years from 1960 to 2022
years_to_plot = list(range(1960, 2023))
# Convert values to percentage
income_data_wealthiest_percent_two = [val for val in income_data_wealthiest_two]
# Filter out values equal to 0
non_zero_years_wealthiest_two = [year for year, val in zip(years_to_plot, income_data_wealthiest_percent_two) if val > 0]
non_zero_percentages_wealthiest_two = [val for val in income_data_wealthiest_percent_two if val > 0]
# Print the data for debugging
print("Years for {} in two.txt: {}".format(country_formatted, non_zero_years_wealthiest_two))
print("Wealthiest Percentages for {} in two.txt: {}".format(country_formatted, non_zero_percentages_wealthiest_two))
# Plot only if data is available for the year
plt.plot(non_zero_years_wealthiest_two, non_zero_percentages_wealthiest_two, label='{} - Wealthiest 10%'.format(country_formatted))
plt.title('Income Distribution Over Years')
plt.xlabel('Year')
plt.ylabel('Income Share (%)')
plt.ylim(0, 100) # Set the Y-axis to 0 to 100 percent
plt.axis([1960, 2022, 0, 100])
plt.grid(True)
# Display legend only if data is present.
if any(formatted_country in income_data_one or formatted_country in income_data_two for formatted_country in formatted_countries):
plt.legend(loc='upper left', bbox_to_anchor=(1, 1)) # Move legend outside the plot area
plt.savefig('income_distribution_plot.png', bbox_inches='tight') # Save the plot as a PNG file
plt.show()
# Example call
countries_to_plot = ['"Germany"']
plot_income_distribution(countries_to_plot)
I could (partially) reproduce and fix.
First of all, your data file is broken and cannot be processed in its current formatting by the csv module. The csv module is great at processing complex data provided they respect the csv rules, and here, the quotes are not correctly balanced. As a result, each line is seen as a single quoted field which is not what you expect.
The correct way would be to fix your data files, but as a workaround you can ask the csv module to ignore any quotes and remove them from the data fields, what your code already does. Just open the readers with
quoting=csv.QUOTE_NONE(for both files...):That should be enough to correctly get the expected number of fields per row.
But you have a second problem: the files contains data in the range 0-100, but you multiply them by 100. As a result your data goes in the 0-10000 range and is plotted outside of the figure...
As a workaround, you can just use:
or directly processed the original values.
But after those two workarounds, I could get a plot.
What you should learn from that: your code already has some debugging prints. If you had added more, specially if you had printed the header line, you would have immediately understood that you only got one single field - that is what I have done...