Migration of International Families to and from Denmark¶
To recruit and retain an international workforce, Denmark also needs to be attractive to international families and not just single people.
This is a look into historical data from 2009-2024 to examine the trends of migration for people under the age of 20.
Assumptions¶
- People under the age of 18 rarely migrate without their family, so the migration of this age group can be used as a proxy for family migration.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
# Load the four raw exports. Each file is read without a header row and
# decoded as ISO-8859-1 (Latin-1).
df_migrant, df_immigrant, df_emigrant, df_citizens = (
    pd.read_csv(csv_path, header=None, encoding='ISO-8859-1')
    for csv_path in (
        'youngmigrants.csv',
        'youngimmigration.csv',
        'youngemigrants.csv',
        'youngcitizens.csv',
    )
)
# Label a raw frame: two descriptive columns followed by one column per year
def assign_columns(df, start_year, end_year):
    """Rename the columns of *df* in place and return the same frame.

    The first two columns become 'Age' and 'Citizenship'; the remaining
    columns are labelled with the integer years start_year..end_year
    (both inclusive).
    """
    year_labels = range(start_year, end_year + 1)
    df.columns = ['Age', 'Citizenship', *year_labels]
    return df
# Label each frame with the year range its columns cover
df_migrant = assign_columns(df_migrant, 2010, 2024)
df_immigrant = assign_columns(df_immigrant, 2009, 2023)
df_emigrant = assign_columns(df_emigrant, 2009, 2023)
df_citizens = assign_columns(df_citizens, 2010, 2023)
# Keep only the first run of digits from the textual age label and store it
# as an integer so ages can be compared numerically
for frame in (df_migrant, df_immigrant, df_emigrant, df_citizens):
    frame['Age'] = frame['Age'].str.extract(r'(\d+)').astype(int)
# Map a numeric age to the schooling stage used throughout the analysis
def assign_age_group(age):
    """Return the age-group label for *age*.

    6 and below -> 'Preschool', 7-15 -> 'School', 16-18 -> 'High school';
    anything else falls through to 'Other'.
    """
    if age <= 6:
        return 'Preschool'
    if 7 <= age <= 15:
        return 'School'
    if 16 <= age <= 18:
        return 'High school'
    # Ages outside the ranges above
    return 'Other'
# Aggregate every source frame by citizenship, metric and age group, limited
# to the years 2010-2023 (2009 and 2024 are not present in all four sources).
# Both the integer and the string spelling of the unwanted years are listed:
# the year headers are assigned as integers, so dropping only '2009'/'2024'
# would silently match nothing.
columns_to_drop = [2009, 2024, '2009', '2024']
aggregated_frames = []
for df, metric in zip([df_migrant, df_immigrant, df_emigrant, df_citizens], ['Migrant', 'Immigrant', 'Emigrant', 'Citizen']):
    # Drop the 2009 and 2024 columns (drop returns a new frame, so the source
    # frame is left untouched by the column additions below)
    df = df.drop(columns=columns_to_drop, errors='ignore')
    # Create Age Group column and Metric column
    df['Age Group'] = df['Age'].apply(assign_age_group)
    df['Metric'] = metric
    # Remove the "Age" column and sum the yearly counts per
    # (Citizenship, Metric, Age Group)
    df_grouped = df.drop(columns=['Age']).groupby(['Citizenship', 'Metric', 'Age Group']).sum(numeric_only=True).reset_index()
    # Add the aggregated frame to the list
    aggregated_frames.append(df_grouped)
# Concatenate the aggregated data frames
df_combined = pd.concat(aggregated_frames, ignore_index=True)
# Year columns (integer or digit-string labels) within the 2010-2023 range
available_year_columns = [
    col for col in df_combined.columns
    if (isinstance(col, int) or (isinstance(col, str) and col.isdigit()))
    and int(col) in range(2010, 2024)
]
# Drop rows where all of those year columns are 0
if available_year_columns:
    df_combined = df_combined.loc[~(df_combined[available_year_columns] == 0).all(axis=1)]
# Safety net: make sure no 2009/2024 column survives the concatenation
df_combined = df_combined.drop(columns=columns_to_drop, errors='ignore')
# Remove rows where "Age Group" is "Other"
df_combined = df_combined[df_combined['Age Group'] != 'Other']
# Stacked bar chart: the ten citizenships with the most rows' worth of
# 'Migrant' counts summed over 2010-2023, one bar per year.
# Define the list of years as integers
years = list(range(2010, 2024))
# Filter for "Migrant" metric
df_migrant_graph = df_combined[df_combined['Metric'] == 'Migrant']
# Group by Citizenship and sum across the year columns
df_grouped = df_migrant_graph.groupby('Citizenship')[years].sum()
# Sum across all year columns to get total migrants for each citizenship
df_grouped['Total'] = df_grouped[years].sum(axis=1)
# Select the top 10 citizenships based on the total number of migrant children
top_10_citizenships = df_grouped.nlargest(10, 'Total')
# Extract only the year columns for plotting (years become the row index)
df_top10 = top_10_citizenships[years].T
# Make sure the year index is integer-typed so the tick labels read as years
df_top10.index = df_top10.index.astype(int)
# Use the 'viridis' colormap for better visual distinction.
# plt.get_cmap replaces matplotlib.cm.get_cmap, which was deprecated in
# Matplotlib 3.7 and is scheduled for removal.
num_colors = len(df_top10.columns)
viridis_colors = plt.get_cmap('viridis', num_colors)(np.linspace(0, 1, num_colors))
# Plot the stacked bar chart, one colour per citizenship
df_top10.plot(kind='bar', stacked=True, color=viridis_colors, figsize=(10, 7))
# Clean up the chart
plt.title('Top 10 Citizenship Migrant Children in Denmark')
plt.xlabel('Year')
plt.ylabel('Number of Migrant Children')
plt.xticks(rotation=45)
# Place the legend outside the axes so it does not cover the bars
plt.legend(title='Citizenship', bbox_to_anchor=(1.05, 1), loc='upper left')
# Display the plot
plt.tight_layout()
plt.show()
/var/folders/hf/6hb2wp2j1fn707lw10cw1t4h0000gn/T/ipykernel_50645/4102712551.py:21: MatplotlibDeprecationWarning: The get_cmap function was deprecated in Matplotlib 3.7 and will be removed in 3.11. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap()`` or ``pyplot.get_cmap()`` instead.
viridis_colors = cm.get_cmap('viridis', num_colors)(np.linspace(0, 1, num_colors))
Many international children come from war zones or otherwise unstable countries. Their families might not have much choice about staying.
# Stacked bar chart: the ten citizenships with the largest 'Emigrant' counts
# summed over all year columns, one bar per year.
# Filter for "Emigrant" metric
df_emigrant_graph = df_combined[df_combined['Metric'] == 'Emigrant']
# Group by Citizenship and sum across the year columns
df_grouped = df_emigrant_graph.groupby('Citizenship')[years].sum()
# Sum across all year columns to get total emigrants for each citizenship
df_grouped['Total'] = df_grouped[years].sum(axis=1)
# Select the top 10 citizenships based on the total number of emigrant children
top_10_citizenships = df_grouped.nlargest(10, 'Total')
# Extract only the year columns for plotting (years become the row index)
df_top10 = top_10_citizenships[years].T
# Make sure the year index is integer-typed so the tick labels read as years
df_top10.index = df_top10.index.astype(int)
# Use the 'viridis' colormap for better visual distinction.
# plt.get_cmap replaces matplotlib.cm.get_cmap, which was deprecated in
# Matplotlib 3.7 and is scheduled for removal.
num_colors = len(df_top10.columns)
viridis_colors = plt.get_cmap('viridis', num_colors)(np.linspace(0, 1, num_colors))
# Plot the stacked bar chart, one colour per citizenship
df_top10.plot(kind='bar', stacked=True, color=viridis_colors, figsize=(10, 7))
# Clean up the chart
plt.title('Top 10 Citizenships of Children who Emigrate from Denmark')
plt.xlabel('Year')
plt.ylabel('Number of Migrant Children')
plt.xticks(rotation=45)
# Place the legend outside the axes so it does not cover the bars
plt.legend(title='Citizenship', bbox_to_anchor=(1.05, 1), loc='upper left')
# Display the plot
plt.tight_layout()
plt.show()
This shows that families are more likely to emigrate if they come from a more stable country. Ukraine is an outlier. A lot of Ukrainian families moved back to Ukraine or Germany after the initial crisis of 2022.
# Inspect the combined table; as a bare expression the notebook renders it as the cell output
df_combined
| Citizenship | Metric | Age Group | 2010 | 2011 | 2012 | 2013 | 2014 | 2015 | 2016 | 2017 | 2018 | 2019 | 2020 | 2021 | 2022 | 2023 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4 | Afghanistan | Migrant | High school | 1121 | 1270 | 1278 | 1267 | 1149 | 1112 | 1008 | 946 | 930 | 959 | 1028 | 1076 | 1041 | 1069 |
| 6 | Afghanistan | Migrant | Preschool | 1746 | 1805 | 1900 | 1980 | 2051 | 2125 | 2182 | 2252 | 2300 | 2302 | 2335 | 2360 | 2355 | 2463 |
| 7 | Afghanistan | Migrant | School | 2455 | 2442 | 2475 | 2463 | 2542 | 2641 | 2715 | 2781 | 2928 | 2993 | 2980 | 2983 | 3006 | 3176 |
| 8 | Africa not stated | Migrant | High school | 5 | 3 | 4 | 3 | 5 | 5 | 2 | 2 | 1 | 3 | 0 | 0 | 1 | 1 |
| 10 | Africa not stated | Migrant | Preschool | 6 | 5 | 8 | 6 | 9 | 3 | 6 | 5 | 6 | 2 | 1 | 2 | 2 | 3 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3834 | Zambia | Citizen | Preschool | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 |
| 3835 | Zambia | Citizen | School | 0 | 0 | 2 | 0 | 1 | 0 | 3 | 0 | 0 | 0 | 1 | 1 | 0 | 0 |
| 3836 | Zimbabwe | Citizen | High school | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3838 | Zimbabwe | Citizen | Preschool | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3839 | Zimbabwe | Citizen | School | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 2 | 0 | 0 | 0 |
1880 rows × 17 columns
# Step 1: Country names used to narrow the analysis to G20 members plus EU
# member states (EU members already named in the G20 list are not repeated)
g20_countries = [
    'Argentina', 'Australia', 'Brazil', 'Canada', 'China', 'France', 'Germany', 'India',
    'Indonesia', 'Italy', 'Japan', 'Mexico', 'Russia', 'Saudi Arabia', 'South Africa',
    'South Korea', 'Turkey', 'United Kingdom', 'USA'
]
eu_member_states = [
    'Austria', 'Belgium', 'Bulgaria', 'Croatia', 'Cyprus', 'Czech Republic', 'Denmark',
    'Estonia', 'Finland', 'Greece', 'Hungary', 'Ireland', 'Latvia', 'Lithuania',
    'Luxembourg', 'Malta', 'Netherlands', 'Poland', 'Portugal', 'Romania', 'Slovakia',
    'Slovenia', 'Spain', 'Sweden'
]
# Lower-cased union of both lists, matching the normalised Citizenship values
g20_eu_countries = [name.lower() for name in g20_countries + eu_member_states]
# Step 2: Normalise Citizenship (lower case, surrounding whitespace removed)
# so the comparison against the list above is case-insensitive
normalised_citizenship = df_combined['Citizenship'].str.lower().str.strip()
df_combined['Citizenship'] = normalised_citizenship
# Keep only the rows whose citizenship is in the G20 + EU list
df_combined_filtered = df_combined[df_combined['Citizenship'].isin(g20_eu_countries)]
# Only replace df_combined when the filter matched something; otherwise warn
# and leave the unfiltered data in place
if not df_combined_filtered.empty:
    df_combined = df_combined_filtered
else:
    print("Warning: No matching data found after filtering. The DataFrame is empty.")
# Reshape to long format: every year column becomes a (Year, Value) row,
# with Metric, Age Group and Citizenship kept as identifiers
melted_df = df_combined.melt(
    id_vars=['Metric', 'Age Group', 'Citizenship'],
    var_name='Year',
    value_name='Value',
)
# Store the year as an integer for easier sorting
melted_df['Year'] = melted_df['Year'].astype(int)
# Put the columns in a more convenient reading order
melted_df = melted_df[['Year', 'Metric', 'Citizenship', 'Age Group', 'Value']]
# One long-format frame per metric
citizens_df = melted_df[melted_df['Metric'].eq('Citizen')]
migrants_df = melted_df[melted_df['Metric'].eq('Migrant')]
immigrants_df = melted_df[melted_df['Metric'].eq('Immigrant')]
emigrants_df = melted_df[melted_df['Metric'].eq('Emigrant')]
# Create function to get top n countries for each year
def get_top_n(df, n=5):
    """Rank citizenships by summed 'Value' within each year.

    Parameters:
        df: long-format frame with 'Year', 'Citizenship' and 'Value' columns.
        n: number of ranks to report per year.

    Returns:
        DataFrame indexed by year with columns 'Rank 1' .. 'Rank n' holding
        citizenship names in descending order of their yearly total. A year
        with fewer than n citizenships is padded with None, and an empty
        input yields an empty frame that still carries the rank columns
        (previously both cases raised ValueError).
    """
    rank_labels = [f'Rank {i+1}' for i in range(n)]
    # Group by Year and Citizenship and sum the values
    grouped = df.groupby(['Year', 'Citizenship'])['Value'].sum().reset_index()
    # Ranked citizenship names per year, each padded to exactly n entries
    ranked_by_year = {}
    for year in grouped['Year'].unique():
        top_n = grouped[grouped['Year'] == year].sort_values(by='Value', ascending=False).head(n)
        names = list(top_n['Citizenship'])
        # Pad short rankings so every row has the same width
        ranked_by_year[year] = names + [None] * (n - len(names))
    # One row per year, one column per rank
    top_n_df = pd.DataFrame(
        list(ranked_by_year.values()),
        index=list(ranked_by_year.keys()),
        columns=rank_labels,
    )
    return top_n_df
# Build the per-year top-5 ranking table for each metric
top_5_citizens, top_5_migrants, top_5_immigrants, top_5_emigrants = (
    get_top_n(metric_frame, n=5)
    for metric_frame in (citizens_df, migrants_df, immigrants_df, emigrants_df)
)
# Uncomment to print the top 5 matrices
#print("Top 5 Citizens per Year:")
#print(top_5_citizens)
#print("\nTop 5 Migrants per Year:")
#print(top_5_migrants)
#print("\nTop 5 Immigrants per Year:")
#print(top_5_immigrants)
#print("\nTop 5 Emigrants per Year:")
#print(top_5_emigrants)
# Rank citizenships by their total over the whole year range
def get_top_n_total(df, n=5):
    """Return the n citizenships with the largest summed 'Value'.

    The result has the columns 'Citizenship' and 'Value', ordered from the
    largest total to the smallest.
    """
    # Total per citizenship over every year present in df
    per_citizenship = df.groupby('Citizenship')['Value'].sum()
    # Largest totals first, then keep the leading n rows
    ranked = per_citizenship.reset_index().sort_values(by='Value', ascending=False)
    return ranked.head(n)
# Build the overall (all years summed) top-5 table for each metric
top_5_citizens_total, top_5_migrants_total, top_5_immigrants_total, top_5_emigrants_total = (
    get_top_n_total(metric_frame, n=5)
    for metric_frame in (citizens_df, migrants_df, immigrants_df, emigrants_df)
)
# Uncomment to print the top 5 countries over the entire year range for each category
#print("Top 5 Citizens Over the Entire Year Range:")
#print(top_5_citizens_total)
#print("\nTop 5 Migrants Over the Entire Year Range:")
#print(top_5_migrants_total)
#print("\nTop 5 Immigrants Over the Entire Year Range:")
#print(top_5_immigrants_total)
#print("\nTop 5 Emigrants Over the Entire Year Range:")
#print(top_5_emigrants_total)
# Stacked bar chart of the 'Citizen' metric per year, split by age group.
# Filter the dataframe to include only the 'Citizen' metric
citizens_df = melted_df[melted_df['Metric'] == 'Citizen']
# Pivot so that each Age Group becomes a column. aggfunc='sum' is required:
# pivot_table averages by default, which would plot the mean per citizenship
# instead of the total number of children.
citizens_pivot = citizens_df.pivot_table(index='Year', columns='Age Group', values='Value', aggfunc='sum')
# Ensure the age groups are in the correct order for stacking
citizens_pivot = citizens_pivot[['Preschool', 'School', 'High school']]
# Create a stacked bar chart with green shades
ax = citizens_pivot.plot(
    kind='bar',
    stacked=True,
    figsize=(10, 6),
    color=['#66ff66', '#33cc33', '#009900']
)
# Add labels and title
ax.set_title('Children who became Danish citizens')
ax.set_ylabel('Number of New Danes')
ax.set_xlabel('Year')
# Rotate the x-axis labels
ax.tick_params(axis='x', rotation=45)  # You can also try: ax.set_xticklabels(...)
# Display the plot
plt.tight_layout()
plt.show()
Families from G20 countries are not very likely to seek Danish citizenship for their children.
# Stacked bar chart of the 'Migrant' metric per year, split by age group.
# Filter the dataframe to include only the 'Migrant' metric
migrants_df = melted_df[melted_df['Metric'] == 'Migrant']
# Pivot so that each Age Group becomes a column. aggfunc='sum' is required:
# pivot_table averages by default, which would plot the mean per citizenship
# instead of the total number of children.
migrants_df_pivot = migrants_df.pivot_table(index='Year', columns='Age Group', values='Value', aggfunc='sum')
# Ensure the age groups are in the correct order for stacking
migrants_df_pivot = migrants_df_pivot[['Preschool', 'School', 'High school']]
# Create a stacked bar chart with green shades
ax = migrants_df_pivot.plot(
    kind='bar',
    stacked=True,
    figsize=(10, 6),
    color=['#66ff66', '#33cc33', '#009900']
)
# Add labels and title
ax.set_title('Children from G20 Countries in Denmark')
ax.set_ylabel('Number of Migrants')
ax.set_xlabel('Year')
# Rotate the x-axis labels to 45 degrees
plt.xticks(rotation=45, ha="right")
# Display the plot
plt.tight_layout()
plt.show()
Look at how the High school group stays steady even as the other two groups increase. International families are not sticking around for high school.
# Select the 'Immigrant' and 'Emigrant' rows of the long-format table
immigration_df = melted_df[melted_df['Metric'] == 'Immigrant']
emigration_df = melted_df[melted_df['Metric'] == 'Emigrant']
# Yearly totals per age group (years as index, age groups as columns)
immigration_pivot = immigration_df.pivot_table(index='Year', columns='Age Group', values='Value', aggfunc='sum')
emigration_pivot = emigration_df.pivot_table(index='Year', columns='Age Group', values='Value', aggfunc='sum')
# Net migration per age group: arrivals minus departures
net_migration_pivot = immigration_pivot - emigration_pivot
# Stacked area chart: immigration above the x-axis, emigration mirrored below
plt.figure(figsize=(10, 6))
# Running upper edges of the immigration bands
imm_preschool = immigration_pivot['Preschool']
imm_through_school = imm_preschool + immigration_pivot['School']
imm_through_high = imm_through_school + immigration_pivot['High school']
plt.fill_between(immigration_pivot.index, imm_preschool, label='Immigration - Preschool', color='#66ff66')
plt.fill_between(immigration_pivot.index, imm_through_school, imm_preschool, label='Immigration - School', color='#33cc33')
plt.fill_between(immigration_pivot.index, imm_through_high, imm_through_school, label='Immigration - High school', color='#009900')
# Running lower edges of the emigration bands (negated so they hang below zero)
em_preschool = -emigration_pivot['Preschool']
em_through_school = em_preschool - emigration_pivot['School']
em_through_high = em_through_school - emigration_pivot['High school']
plt.fill_between(emigration_pivot.index, em_preschool, label='Emigration - Preschool', color='#ff6666')
plt.fill_between(emigration_pivot.index, em_through_school, em_preschool, label='Emigration - School', color='#cc3333')
plt.fill_between(emigration_pivot.index, em_through_high, em_through_school, label='Emigration - High school', color='#990000')
# Titles and axis labels
plt.title('Immigration and Emigration of Children from G20 Countries')
plt.ylabel('Number of People')
plt.xlabel('Year')
# Draw the zero line that separates immigration from emigration
plt.axhline(0, color='black', linewidth=0.8)
plt.xticks(rotation=45)
# Legend
plt.legend(loc='upper right')
# Render the figure
plt.tight_layout()
plt.show()
# Stacked area chart of net migration per age group, with the year on the x-axis
fig, ax = plt.subplots(figsize=(10, 6))
ax.stackplot(
    net_migration_pivot.index,
    *(net_migration_pivot[group] for group in ('Preschool', 'School', 'High school')),
    labels=['Pre-school', 'School', 'High school'],
    colors=['lightblue', 'blue', 'darkblue'],
)
# Axis labels and title
ax.set_xlabel('Year')
ax.set_ylabel('Net Migration')
ax.set_title('Net Migration of Children from G20 Countries')
# Legend
ax.legend(loc='upper left')
# Render the figure
plt.tight_layout()
plt.show()
You can see from the first graph that international families with preschoolers are immigrating AND emigrating in very high numbers. The net migration graph obscures this detail. Also, 2020 really distorted the statistics.