import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.cm as cm

# Load the four source tables. Every file is read without a header row and
# decoded as ISO-8859-1 (Latin-1).
df_migrant, df_immigrant, df_emigrant, df_citizens = (
    pd.read_csv(csv_name, header=None, encoding='ISO-8859-1')
    for csv_name in (
        'youngmigrants.csv',
        'youngimmigration.csv',
        'youngemigrants.csv',
        'youngcitizens.csv',
    )
)

# Label a raw table: two identifier columns followed by one column per year
def assign_columns(df, start_year, end_year):
    """Rename the columns of *df* in place and return the same frame.

    The first two columns become 'Age' and 'Citizenship'; the remaining
    columns are labelled with the integer years ``start_year`` through
    ``end_year`` inclusive. pandas raises ``ValueError`` when the number of
    labels does not match the number of columns in *df*.
    """
    year_labels = range(start_year, end_year + 1)
    df.columns = ['Age', 'Citizenship', *year_labels]
    return df

# Name the columns of every table; each source covers its own year span.
df_migrant = assign_columns(df_migrant, 2010, 2024)
df_immigrant = assign_columns(df_immigrant, 2009, 2023)
df_emigrant = assign_columns(df_emigrant, 2009, 2023)
df_citizens = assign_columns(df_citizens, 2010, 2023)

# Replace the textual 'Age' column with the first run of digits it contains,
# converted to int.
for frame in (df_migrant, df_immigrant, df_emigrant, df_citizens):
    frame['Age'] = frame['Age'].str.extract(r'(\d+)').astype(int)

# Map a numeric age onto the schooling band used throughout the analysis
def assign_age_group(age):
    """Return the age-group label for *age*.

    ``age <= 6`` -> 'Preschool', ``7..15`` -> 'School',
    ``16..18`` -> 'High school'; anything else -> 'Other'.
    """
    if age <= 6:
        return 'Preschool'

    # Inclusive (low, high) bounds for the remaining bands.
    bands = (
        (7, 15, 'School'),
        (16, 18, 'High school'),
    )
    for low, high, label in bands:
        if low <= age <= high:
            return label

    # Ages outside every band (e.g. above 18).
    return 'Other'

# Aggregate every source table to one row per (Citizenship, Metric, Age Group)
aggregated_frames = []

for df, metric in zip([df_migrant, df_immigrant, df_emigrant, df_citizens], ['Migrant', 'Immigrant', 'Emigrant', 'Citizen']):
    # Drop the 2009 and 2024 columns, which exist in only some of the sources.
    # The year labels created by assign_columns are integers, so the integer
    # labels must be listed; the string forms alone never match and, with
    # errors='ignore', the drop silently did nothing. The string forms are
    # kept as well in case a frame carries string year labels.
    df = df.drop(columns=[2009, 2024, '2009', '2024'], errors='ignore')

    # Derive the age band from the numeric age and tag the rows with the
    # name of the source they came from.
    df['Age Group'] = df['Age'].apply(assign_age_group)
    df['Metric'] = metric

    # The raw age is no longer needed: sum the yearly counts of all ages that
    # fall into the same band for each citizenship.
    df_grouped = df.drop(columns=['Age']).groupby(['Citizenship', 'Metric', 'Age Group']).sum(numeric_only=True).reset_index()

    # Add the aggregated frame to the list
    aggregated_frames.append(df_grouped)

# Stack the four aggregated tables into a single long frame
df_combined = pd.concat(aggregated_frames, ignore_index=True)

# Collect the year columns (integer labels or all-digit strings) that fall
# inside the 2010-2023 analysis window.
available_year_columns = [
    label
    for label in df_combined.columns
    if (isinstance(label, int) or (isinstance(label, str) and label.isdigit()))
    and 2010 <= int(label) <= 2023
]

# Discard rows whose value is 0 in every one of those year columns.
if available_year_columns:
    df_combined = df_combined.loc[~df_combined[available_year_columns].eq(0).all(axis=1)]

# Remove the out-of-window years, whether labelled as int or as str.
columns_to_drop = [2009, 2024, '2009', '2024']
df_combined = df_combined.drop(
    columns=[label for label in columns_to_drop if label in df_combined.columns],
    errors='ignore',
)

# Keep only the three schooling bands; "Other" rows are excluded.
df_combined = df_combined[df_combined['Age Group'].ne('Other')]

# Year labels (integers) used by the charts below
years = [*range(2010, 2024)]

# Stacked bar chart: the ten citizenships with the most migrant children.

# Keep only the rows of the "Migrant" metric
df_migrant_graph = df_combined[df_combined['Metric'] == 'Migrant']

# Collapse the age groups: one row per citizenship, one column per year
df_grouped = df_migrant_graph.groupby('Citizenship')[years].sum()

# Total over all years, used only to rank the citizenships
df_grouped['Total'] = df_grouped[years].sum(axis=1)

# Select the top 10 citizenships based on the total number of migrant children
top_10_citizenships = df_grouped.nlargest(10, 'Total')

# Transpose so that years become the x-axis categories and each citizenship
# becomes one stacked series
df_top10 = top_10_citizenships[years].T

# Make sure the year index is integer-typed so the tick labels read 2010, 2011, ...
df_top10.index = df_top10.index.astype(int)

# One evenly spaced 'viridis' colour per citizenship.
# plt.get_cmap replaces matplotlib.cm.get_cmap, which was deprecated in
# Matplotlib 3.7 and is scheduled for removal in 3.11.
num_colors = len(df_top10.columns)
viridis_colors = plt.get_cmap('viridis', num_colors)(np.linspace(0, 1, num_colors))

# Draw the stacked bar chart
df_top10.plot(kind='bar', stacked=True, color=viridis_colors, figsize=(10, 7))

# Clean up the chart
plt.title('Top 10 Citizenship Migrant Children in Denmark')
plt.xlabel('Year')
plt.ylabel('Number of Migrant Children')
plt.xticks(rotation=45)
# Place the legend outside the axes so it does not cover the bars
plt.legend(title='Citizenship', bbox_to_anchor=(1.05, 1), loc='upper left')

# Display the plot
plt.tight_layout()
plt.show()

/var/folders/hf/6hb2wp2j1fn707lw10cw1t4h0000gn/T/ipykernel_50645/4102712551.py:21: MatplotlibDeprecationWarning: The get_cmap function was deprecated in Matplotlib 3.7 and will be removed in 3.11. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap()`` or ``pyplot.get_cmap()`` instead.
  viridis_colors = cm.get_cmap('viridis', num_colors)(np.linspace(0, 1, num_colors))

# Stacked bar chart: the ten citizenships with the most emigrating children.

# Keep only the rows of the "Emigrant" metric
df_emigrant_graph = df_combined[df_combined['Metric'] == 'Emigrant']

# Collapse the age groups: one row per citizenship, one column per year
df_grouped = df_emigrant_graph.groupby('Citizenship')[years].sum()

# Total over all years, used only to rank the citizenships
df_grouped['Total'] = df_grouped[years].sum(axis=1)

# Select the top 10 citizenships based on the total number of emigrant children
top_10_citizenships = df_grouped.nlargest(10, 'Total')

# Transpose so that years become the x-axis categories and each citizenship
# becomes one stacked series
df_top10 = top_10_citizenships[years].T

# Make sure the year index is integer-typed so the tick labels read 2010, 2011, ...
df_top10.index = df_top10.index.astype(int)

# One evenly spaced 'viridis' colour per citizenship.
# plt.get_cmap replaces matplotlib.cm.get_cmap, which was deprecated in
# Matplotlib 3.7 and is scheduled for removal in 3.11.
num_colors = len(df_top10.columns)
viridis_colors = plt.get_cmap('viridis', num_colors)(np.linspace(0, 1, num_colors))

# Draw the stacked bar chart
df_top10.plot(kind='bar', stacked=True, color=viridis_colors, figsize=(10, 7))

# Clean up the chart
plt.title('Top 10 Citizenships of Children who Emigrate from Denmark')
plt.xlabel('Year')
plt.ylabel('Number of Migrant Children')
plt.xticks(rotation=45)
# Place the legend outside the axes so it does not cover the bars
plt.legend(title='Citizenship', bbox_to_anchor=(1.05, 1), loc='upper left')

# Display the plot
plt.tight_layout()
plt.show()

df_combined

# Step 1: Define G20 countries including EU member states
g20_countries = [
    'Argentina', 'Australia', 'Brazil', 'Canada', 'China', 'France', 'Germany', 'India',
    'Indonesia', 'Italy', 'Japan', 'Mexico', 'Russia', 'Saudi Arabia', 'South Africa',
    'South Korea', 'Turkey', 'United Kingdom', 'USA'
]

# EU members that are not already listed individually among the G20 countries
eu_member_states = [
    'Austria', 'Belgium', 'Bulgaria', 'Croatia', 'Cyprus', 'Czech Republic', 'Denmark',
    'Estonia', 'Finland', 'Greece', 'Hungary', 'Ireland', 'Latvia', 'Lithuania',
    'Luxembourg', 'Malta', 'Netherlands', 'Poland', 'Portugal', 'Romania', 'Slovakia',
    'Slovenia', 'Spain', 'Sweden'
]

# Combine G20 countries and EU member states (lower-cased for matching)
g20_eu_countries = [country.lower() for country in (g20_countries + eu_member_states)]

# Step 2: Standardize the Citizenship column and filter the DataFrame
# Convert citizenship names to lowercase and strip whitespace for standardization.
# .assign builds a new frame instead of writing into df_combined, which at this
# point is the result of boolean filtering; assigning a column on such a frame
# can trigger pandas' SettingWithCopyWarning.
df_combined = df_combined.assign(
    Citizenship=df_combined['Citizenship'].str.lower().str.strip()
)

# Filter for G20 + EU countries
df_combined_filtered = df_combined[df_combined['Citizenship'].isin(g20_eu_countries)]

# Only adopt the filtered frame when it has rows; otherwise warn and keep the
# unfiltered data.
if df_combined_filtered.empty:
    print("Warning: No matching data found after filtering. The DataFrame is empty.")
else:
    df_combined = df_combined_filtered

# Reshape wide -> long: one row per (Metric, Age Group, Citizenship, Year),
# with the former year headers stored as an integer 'Year' column and the
# columns arranged in a convenient reading order.
melted_df = df_combined.melt(
    id_vars=['Metric', 'Age Group', 'Citizenship'],
    var_name='Year',
    value_name='Value',
)
melted_df = melted_df.astype({'Year': int})
melted_df = melted_df[['Year', 'Metric', 'Citizenship', 'Age Group', 'Value']]

# One long-format frame per metric
citizens_df = melted_df[melted_df['Metric'].eq('Citizen')]
migrants_df = melted_df[melted_df['Metric'].eq('Migrant')]
immigrants_df = melted_df[melted_df['Metric'].eq('Immigrant')]
emigrants_df = melted_df[melted_df['Metric'].eq('Emigrant')]

# Create function to get the top n countries for each year
def get_top_n(df, n=5):
    """Rank citizenships by summed 'Value' within every year.

    Parameters
    ----------
    df : DataFrame with 'Year', 'Citizenship' and 'Value' columns.
    n : number of ranks to report per year (default 5).

    Returns
    -------
    DataFrame indexed by year with columns 'Rank 1' .. 'Rank n', holding the
    citizenship names in descending order of their yearly total. A year with
    fewer than n citizenships is padded with missing values, and an empty
    input yields an empty frame that still has the rank columns. Previously
    both situations raised ValueError because of mismatched lengths.
    """
    # Group by Year and Citizenship and sum the values
    grouped = df.groupby(['Year', 'Citizenship'])['Value'].sum().reset_index()
    rank_labels = [f'Rank {i + 1}' for i in range(n)]

    # year -> list of exactly n entries (top citizenships, then padding)
    ranked_rows = {}
    for year, year_rows in grouped.groupby('Year', sort=False):
        leaders = (
            year_rows.sort_values(by='Value', ascending=False)
            .head(n)['Citizenship']
            .tolist()
        )
        # Pad short years so every row has the same width.
        ranked_rows[year] = leaders + [None] * (n - len(leaders))

    return pd.DataFrame(
        list(ranked_rows.values()),
        index=list(ranked_rows),
        columns=rank_labels,
    )

# Per-year top-5 ranking tables, one for each metric
top_5_citizens, top_5_migrants, top_5_immigrants, top_5_emigrants = (
    get_top_n(metric_frame, n=5)
    for metric_frame in (citizens_df, migrants_df, immigrants_df, emigrants_df)
)

# The tables are not printed here; to inspect one, print it, e.g.
# print(top_5_citizens), print(top_5_migrants), print(top_5_immigrants)
# or print(top_5_emigrants).

# Rank citizenships over the whole period rather than per year
def get_top_n_total(df, n=5):
    """Return the *n* citizenships with the largest summed 'Value'.

    The result has the columns 'Citizenship' and 'Value', sorted by 'Value'
    in descending order.
    """
    totals_by_citizenship = df.groupby('Citizenship', as_index=False)['Value'].sum()
    ranked = totals_by_citizenship.sort_values(by='Value', ascending=False)
    return ranked.head(n)

# Whole-period top-5 tables, one for each metric
(
    top_5_citizens_total,
    top_5_migrants_total,
    top_5_immigrants_total,
    top_5_emigrants_total,
) = (
    get_top_n_total(metric_frame, n=5)
    for metric_frame in (citizens_df, migrants_df, immigrants_df, emigrants_df)
)

# The tables are not printed here; to inspect one, print it, e.g.
# print(top_5_citizens_total), print(top_5_migrants_total),
# print(top_5_immigrants_total) or print(top_5_emigrants_total).

# Stacked bar chart: children who obtained citizenship, per year and age group.

# Keep only the rows of the 'Citizen' metric
citizens_df = melted_df[melted_df['Metric'] == 'Citizen']

# Pivot so that each Age Group becomes a column. melted_df has one row per
# citizenship, so the values must be summed: pivot_table's default aggfunc is
# the mean, which would plot the average per citizenship instead of the
# number of children that the axis label promises.
citizens_pivot = citizens_df.pivot_table(index='Year', columns='Age Group', values='Value', aggfunc='sum')

# Ensure the age groups are in the correct order for stacking
citizens_pivot = citizens_pivot[['Preschool', 'School', 'High school']]

# Create a stacked bar chart with green shades (light to dark by age group)
ax = citizens_pivot.plot(
    kind='bar', 
    stacked=True, 
    figsize=(10, 6), 
    color=['#66ff66', '#33cc33', '#009900']
)

# Add labels and title
ax.set_title('Children who became Danish citizens')
ax.set_ylabel('Number of New Danes')
ax.set_xlabel('Year')

# Rotate the x-axis labels
ax.tick_params(axis='x', rotation=45)  # You can also try: ax.set_xticklabels(...)

# Display the plot
plt.tight_layout()
plt.show()

# Stacked bar chart: migrant children, per year and age group.

# Keep only the rows of the 'Migrant' metric
migrants_df = melted_df[melted_df['Metric'] == 'Migrant']

# Pivot so that each Age Group becomes a column. melted_df has one row per
# citizenship, so the values must be summed: pivot_table's default aggfunc is
# the mean, which would plot the average per citizenship instead of the
# number of migrants that the axis label promises.
migrants_df_pivot = migrants_df.pivot_table(index='Year', columns='Age Group', values='Value', aggfunc='sum')

# Ensure the age groups are in the correct order for stacking
migrants_df_pivot = migrants_df_pivot[['Preschool', 'School', 'High school']]

# Create a stacked bar chart with green shades (light to dark by age group)
ax = migrants_df_pivot.plot(
    kind='bar', 
    stacked=True, 
    figsize=(10, 6), 
    color=['#66ff66', '#33cc33', '#009900']
)

# Add labels and title
ax.set_title('Children from G20 Countries in Denmark')
ax.set_ylabel('Number of Migrants')
ax.set_xlabel('Year')

# Rotate the x-axis labels to 45 degrees
plt.xticks(rotation=45, ha="right")

# Display the plot
plt.tight_layout()
plt.show()

# Immigration vs. emigration: yearly totals per age group
immigration_df = melted_df[melted_df['Metric'].eq('Immigrant')]
emigration_df = melted_df[melted_df['Metric'].eq('Emigrant')]

# Years as index, age groups as columns, values summed over citizenships
immigration_pivot = pd.pivot_table(
    immigration_df, index='Year', columns='Age Group', values='Value', aggfunc='sum'
)
emigration_pivot = pd.pivot_table(
    emigration_df, index='Year', columns='Age Group', values='Value', aggfunc='sum'
)

# Net migration per age group (immigration minus emigration)
net_migration_pivot = immigration_pivot.sub(emigration_pivot)

# Mirrored stacked area chart: immigration above the axis, emigration below
plt.figure(figsize=(10, 6))

# Immigration bands, stacked upwards from zero
plt.fill_between(
    immigration_pivot.index,
    immigration_pivot['Preschool'],
    label='Immigration - Preschool',
    color='#66ff66',
)
plt.fill_between(
    immigration_pivot.index,
    immigration_pivot['Preschool'] + immigration_pivot['School'],
    immigration_pivot['Preschool'],
    label='Immigration - School',
    color='#33cc33',
)
plt.fill_between(
    immigration_pivot.index,
    immigration_pivot['Preschool'] + immigration_pivot['School'] + immigration_pivot['High school'],
    immigration_pivot['Preschool'] + immigration_pivot['School'],
    label='Immigration - High school',
    color='#009900',
)

# Emigration bands, stacked downwards from zero (values negated)
plt.fill_between(
    emigration_pivot.index,
    -emigration_pivot['Preschool'],
    label='Emigration - Preschool',
    color='#ff6666',
)
plt.fill_between(
    emigration_pivot.index,
    -emigration_pivot['Preschool'] - emigration_pivot['School'],
    -emigration_pivot['Preschool'],
    label='Emigration - School',
    color='#cc3333',
)
plt.fill_between(
    emigration_pivot.index,
    -emigration_pivot['Preschool'] - emigration_pivot['School'] - emigration_pivot['High school'],
    -emigration_pivot['Preschool'] - emigration_pivot['School'],
    label='Emigration - High school',
    color='#990000',
)

# Titles, axis labels and a zero baseline separating the two directions
plt.title('Immigration and Emigration of Children from G20 Countries')
plt.ylabel('Number of People')
plt.xlabel('Year')
plt.axhline(0, color='black', linewidth=0.8)
plt.xticks(rotation=45)

# Legend in the upper right corner
plt.legend(loc='upper right')

# Render the figure
plt.tight_layout()
plt.show()

# Net migration per age group, drawn as a stacked area chart over the years
fig, ax = plt.subplots(figsize=(10, 6))

# One stacked layer per age group, youngest at the bottom
ax.stackplot(
    net_migration_pivot.index,
    *(net_migration_pivot[group] for group in ('Preschool', 'School', 'High school')),
    labels=['Pre-school', 'School', 'High school'],
    colors=['lightblue', 'blue', 'darkblue'],
)

# Axis labels and title
ax.set(
    xlabel='Year',
    ylabel='Net Migration',
    title='Net Migration of Children from G20 Countries',
)

# Legend in the upper left corner
ax.legend(loc='upper left')

# Render the figure
plt.tight_layout()
plt.show()

	Citizenship	Metric	Age Group	2010	2011	2012	2013	2014	2015	2016	2017	2018	2019	2020	2021	2022	2023
4	Afghanistan	Migrant	High school	1121	1270	1278	1267	1149	1112	1008	946	930	959	1028	1076	1041	1069
6	Afghanistan	Migrant	Preschool	1746	1805	1900	1980	2051	2125	2182	2252	2300	2302	2335	2360	2355	2463
7	Afghanistan	Migrant	School	2455	2442	2475	2463	2542	2641	2715	2781	2928	2993	2980	2983	3006	3176
8	Africa not stated	Migrant	High school	5	3	4	3	5	5	2	2	1	3	0	0	1	1
10	Africa not stated	Migrant	Preschool	6	5	8	6	9	3	6	5	6	2	1	2	2	3
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
3834	Zambia	Citizen	Preschool	1	0	1	0	0	0	0	0	0	0	1	1	0	0
3835	Zambia	Citizen	School	0	0	2	0	1	0	3	0	0	0	1	1	0	0
3836	Zimbabwe	Citizen	High school	0	0	1	0	0	0	0	0	0	0	0	0	0	0
3838	Zimbabwe	Citizen	Preschool	1	0	0	0	0	0	0	0	0	0	0	0	0	0
3839	Zimbabwe	Citizen	School	1	0	0	0	0	0	1	0	0	0	2	0	0	0

Migration of International Families to and from Denmark

Assumptions