import pandas as pd
import matplotlib.pyplot as plt

# Install a global "ignore" filter: with no category or module given it
# matches every warning raised from here on.
from warnings import filterwarnings
filterwarnings(action='ignore')


# Read both CSV exports in file order; the paths are relative to the current
# working directory.
dataset_1, dataset_2 = (
    pd.read_csv(csv_name)
    for csv_name in (
        '1900_2021_DISASTERS.xlsx - emdat data.csv',
        '1970-2021_DISASTERS.xlsx - emdat data.csv',
    )
)


# Lift pandas' display limits (None = unlimited) so printed frames show every
# column and every row instead of being truncated.
pd.options.display.max_columns = None
pd.options.display.max_rows = None


# Preview the top five rows of dataset_1
dataset_1.head(n=5)


# Preview the top five rows of dataset_2
dataset_2.head(n=5)


# Compare the column names of the two datasets: which are shared, and which
# appear in only one of them.
columns_1 = set(dataset_1.columns)
columns_2 = set(dataset_2.columns)

common_columns = columns_1 & columns_2
unique_to_dataset_1 = columns_1 - columns_2
unique_to_dataset_2 = columns_2 - columns_1

common_columns, unique_to_dataset_1, unique_to_dataset_2

({'Adm Level',
  'Admin1 Code',
  'Admin2 Code',
  'Aid Contribution',
  'Appeal',
  'Associated Dis',
  'Associated Dis2',
  'CPI',
  'Continent',
  'Country',
  'Declaration',
  'Dis Mag Scale',
  'Dis Mag Value',
  'Disaster Group',
  'Disaster Subgroup',
  'Disaster Subsubtype',
  'Disaster Subtype',
  'Disaster Type',
  'End Day',
  'End Month',
  'End Year',
  'Event Name',
  'Geo Locations',
  'Glide',
  'ISO',
  "Insured Damages ('000 US$)",
  'Latitude',
  'Local Time',
  'Location',
  'Longitude',
  'No Affected',
  'No Homeless',
  'No Injured',
  'OFDA Response',
  'Origin',
  'Region',
  'River Basin',
  'Seq',
  'Start Day',
  'Start Month',
  'Start Year',
  'Total Affected',
  "Total Damages ('000 US$)",
  'Total Deaths',
  'Year'},
 set(),
 {'Dis No', "Reconstruction Costs ('000 US$)"})


# Combine the two datasets on the columns they share.
# Walk dataset_1's columns instead of converting the unordered set to a list,
# so the column order is deterministic and follows the source file.
shared_columns = [column for column in dataset_1.columns if column in common_columns]
combined_dataset = pd.concat(
    [dataset_1[shared_columns], dataset_2[shared_columns]],
    axis=0,
    ignore_index=True,
)

# The file names indicate overlapping periods (1900-2021 and 1970-2021), so an
# event present in both files would otherwise be counted twice in every later
# aggregate. Year + Seq + ISO are the components of dataset_2's 'Dis No'
# (e.g. '1970-0013-ARG'), so together they identify one event in one country.
# keep='first' retains the dataset_1 copy of any repeated event.
combined_dataset = combined_dataset.drop_duplicates(subset=['Year', 'Seq', 'ISO'], keep='first')

# Sort by Year and then by Seq (sequence number), and renumber the rows from 0
combined_dataset = combined_dataset.sort_values(by=['Year', 'Seq']).reset_index(drop=True)

# Display the first few rows of the combined dataset
combined_dataset_head = combined_dataset.head()

combined_dataset_head


# Count the missing entries per column, keep only the columns that have at
# least one, and list them from most to least affected.
null_counts = combined_dataset.isna().sum()
missing_values = null_counts[null_counts > 0]
missing_values = missing_values.sort_values(ascending=False)

missing_values

Aid Contribution              29416
Associated Dis2               29365
Local Time                    28902
Disaster Subsubtype           28649
Insured Damages ('000 US$)    28580
River Basin                   28198
OFDA Response                 27626
Glide                         27608
No Homeless                   26091
Appeal                        25761
Latitude                      25710
Longitude                     25703
Declaration                   24387
Associated Dis                24190
Event Name                    23264
No Injured                    23224
Origin                        23196
Admin2 Code                   22836
Admin1 Code                   21608
Dis Mag Value                 21255
Total Damages ('000 US$)      20662
Adm Level                     15056
Geo Locations                 15056
No Affected                   12704
Total Deaths                   9158
Total Affected                 8112
Start Day                      6695
End Day                        6550
Disaster Subtype               5857
Location                       3138
Dis Mag Scale                  2263
End Month                      1257
Start Month                     655
CPI                             630
dtype: int64


# Number of recorded disasters per year
yearly_disaster_counts = combined_dataset.groupby('Year').size()

# Sum of the reported total damages per year
yearly_total_damages = combined_dataset.groupby('Year')["Total Damages ('000 US$)"].sum()

# One figure with two side-by-side panels: counts on the left, damages on the
# right. Each entry is (axes, series, line colour, title, y-axis label).
trend_fig, (count_ax, damage_ax) = plt.subplots(1, 2, figsize=(14, 7))
trend_panels = [
    (count_ax, yearly_disaster_counts, 'blue',
     'Yearly Trend of Number of Disasters', 'Number of Disasters'),
    (damage_ax, yearly_total_damages, 'red',
     'Yearly Trend of Total Damages from Disasters', 'Total Damages (in \'000 US$)'),
]

for panel_ax, panel_series, line_color, panel_title, y_label in trend_panels:
    panel_series.plot(ax=panel_ax, color=line_color)
    panel_ax.set_title(panel_title, fontweight='bold')
    panel_ax.set_xlabel('Year', fontweight='bold')
    panel_ax.set_ylabel(y_label, fontweight='bold')
    panel_ax.grid(True, which='both', linestyle='--', linewidth=0.5)

trend_fig.tight_layout()
plt.show()


# How often each disaster type occurs, most frequent first
disaster_type_counts = combined_dataset['Disaster Type'].value_counts()

# Bar chart of the counts on a single axes
type_fig, type_ax = plt.subplots(figsize=(14, 7))
disaster_type_counts.plot(kind='bar', ax=type_ax, color='skyblue', edgecolor='black')
type_ax.set_title('Distribution of Disaster Types', fontweight='bold')
type_ax.set_xlabel('Disaster Type', fontweight='bold')
type_ax.set_ylabel('Number of Occurrences', fontweight='bold')
# Tilt the category labels so long type names do not overlap
plt.setp(type_ax.get_xticklabels(), rotation=45)
type_ax.grid(axis='y', linestyle='--', linewidth=0.5)

type_fig.tight_layout()
plt.show()


# Keep only the rows whose Country is exactly 'Canada'
is_canada = combined_dataset['Country'].eq('Canada')
canada_data = combined_dataset[is_canada]

# How often each disaster type occurs in Canada, most frequent first
canada_disaster_type_counts = canada_data['Disaster Type'].value_counts()

# Bar chart of the Canadian counts on a single axes
canada_fig, canada_ax = plt.subplots(figsize=(14, 7))
canada_disaster_type_counts.plot(kind='bar', ax=canada_ax, color='cadetblue', edgecolor='black')
canada_ax.set_title('Distribution of Disaster Types in Canada', fontweight='bold')
canada_ax.set_xlabel('Disaster Type', fontweight='bold')
canada_ax.set_ylabel('Number of Occurrences', fontweight='bold')
# Tilt the category labels so long type names do not overlap
plt.setp(canada_ax.get_xticklabels(), rotation=45)
canada_ax.grid(axis='y', linestyle='--', linewidth=0.5)

canada_fig.tight_layout()
plt.show()


# Top 4 disaster types in Canada (value_counts is already sorted descending)
top_4_disasters_canada = canada_disaster_type_counts.head(4).index.tolist()

# One subplot per type (2 x 2 grid) showing how many events occurred each year
plt.figure(figsize=(18, 12))

for i, disaster_type in enumerate(top_4_disasters_canada, 1):
    yearly_occurrences = canada_data[canada_data['Disaster Type'] == disaster_type].groupby('Year').size()

    # groupby('Year') only returns years that have at least one event. Without
    # the zero-filled years the line would join distant years directly and
    # hide the quiet periods, so fill every year between the first and the
    # last event with an explicit 0.
    year_span = pd.RangeIndex(
        int(yearly_occurrences.index.min()),
        int(yearly_occurrences.index.max()) + 1,
        name='Year',
    )
    yearly_occurrences = yearly_occurrences.reindex(year_span, fill_value=0)

    plt.subplot(2, 2, i)
    yearly_occurrences.plot(color='dodgerblue')
    plt.title(f'Yearly Occurrences of {disaster_type} in Canada', fontweight='bold', fontsize=14)
    plt.xlabel('Year', fontsize=12)
    plt.ylabel('Number of Occurrences', fontsize=12)
    plt.grid(True, which='both', linestyle='--', linewidth=0.5)

plt.tight_layout()
plt.show()


# Yearly trend of economic damages in Canada due to natural disasters
yearly_damages_canada = canada_data.groupby('Year')["Total Damages ('000 US$)"].sum()

# groupby('Year') skips years with no Canadian records, which makes the line
# jump straight across them. sum() already reports 0 for a year whose damage
# values are all missing, so fill the absent years with 0 as well to give a
# continuous yearly axis.
canada_year_span = pd.RangeIndex(
    int(yearly_damages_canada.index.min()),
    int(yearly_damages_canada.index.max()) + 1,
    name='Year',
)
yearly_damages_canada = yearly_damages_canada.reindex(canada_year_span, fill_value=0)

plt.figure(figsize=(14, 7))
yearly_damages_canada.plot(color='forestgreen')
plt.title('Yearly Trend of Economic Damages from Natural Disasters in Canada', fontweight='bold')
plt.xlabel('Year', fontweight='bold')
plt.ylabel('Economic Damage (in \'000 US$)', fontweight='bold')
plt.grid(True, which='both', linestyle='--', linewidth=0.5)

plt.tight_layout()
plt.show()


# One subplot per top-4 disaster type (2 x 2 grid) showing the summed
# economic damages per year in Canada
plt.figure(figsize=(18, 12))

for i, disaster_type in enumerate(top_4_disasters_canada, 1):
    yearly_damages_type = canada_data[canada_data['Disaster Type'] == disaster_type].groupby('Year')["Total Damages ('000 US$)"].sum()

    # Years without any event of this type are missing from the grouped
    # result, so the line would bridge them. Fill every year between the
    # first and last event with 0, the same value sum() gives a year whose
    # damage figures are all missing.
    year_span = pd.RangeIndex(
        int(yearly_damages_type.index.min()),
        int(yearly_damages_type.index.max()) + 1,
        name='Year',
    )
    yearly_damages_type = yearly_damages_type.reindex(year_span, fill_value=0)

    plt.subplot(2, 2, i)
    yearly_damages_type.plot(color='darkorchid')
    plt.title(f'Yearly Economic Damages from {disaster_type} in Canada', fontweight='bold', fontsize=14)
    plt.xlabel('Year', fontweight='bold', fontsize=12)
    plt.ylabel('Economic Damage (in \'000 US$)', fontweight='bold', fontsize=12)
    plt.grid(True, which='both', linestyle='--', linewidth=0.5)

plt.tight_layout()
plt.show()


# One subplot per top-4 disaster type (2 x 2 grid) showing the summed deaths
# per year in Canada
plt.figure(figsize=(18, 12))

for i, disaster_type in enumerate(top_4_disasters_canada, 1):
    yearly_deaths_type = canada_data[canada_data['Disaster Type'] == disaster_type].groupby('Year')["Total Deaths"].sum()

    # Years without any event of this type are missing from the grouped
    # result, so the line would bridge them. Fill every year between the
    # first and last event with 0; the first and last year (used in the
    # title below) are unchanged.
    year_span = pd.RangeIndex(
        int(yearly_deaths_type.index.min()),
        int(yearly_deaths_type.index.max()) + 1,
        name='Year',
    )
    yearly_deaths_type = yearly_deaths_type.reindex(year_span, fill_value=0)

    plt.subplot(2, 2, i)
    yearly_deaths_type.plot(color='crimson')
    plt.title(f'Yearly Deaths from {disaster_type} in Canada ({yearly_deaths_type.index.min()} - {yearly_deaths_type.index.max()})', fontweight='bold', fontsize=14)
    plt.xlabel('Year', fontweight='bold', fontsize=12)
    plt.ylabel('Number of Deaths', fontweight='bold', fontsize=12)
    plt.grid(True, which='both', linestyle='--', linewidth=0.5)

plt.tight_layout()
plt.show()

	Year	Seq	Glide	Disaster Group	Disaster Subgroup	Disaster Type	Disaster Subtype	Disaster Subsubtype	Event Name	Country	ISO	Region	Continent	Location	Origin	Associated Dis	Associated Dis2	OFDA Response	Appeal	Declaration	Aid Contribution	Dis Mag Value	Dis Mag Scale	Latitude	Longitude	Local Time	River Basin	Start Year	Start Month	Start Day	End Year	End Month	End Day	Total Deaths	No Injured	No Affected	No Homeless	Total Affected	Insured Damages ('000 US$)	Total Damages ('000 US$)	CPI	Adm Level	Admin1 Code	Admin2 Code	Geo Locations
0	1900	9002	NaN	Natural	Climatological	Drought	Drought	NaN	NaN	Cabo Verde	CPV	Western Africa	Africa	Countrywide	NaN	Famine	NaN	NaN	No	No	NaN	NaN	Km2	NaN	NaN	NaN	NaN	1900	NaN	NaN	1900	NaN	NaN	11000.0	NaN	NaN	NaN	NaN	NaN	NaN	3.221647	NaN	NaN	NaN	NaN
1	1900	9001	NaN	Natural	Climatological	Drought	Drought	NaN	NaN	India	IND	Southern Asia	Asia	Bengal	NaN	NaN	NaN	NaN	No	No	NaN	NaN	Km2	NaN	NaN	NaN	NaN	1900	NaN	NaN	1900	NaN	NaN	1250000.0	NaN	NaN	NaN	NaN	NaN	NaN	3.221647	NaN	NaN	NaN	NaN
2	1902	12	NaN	Natural	Geophysical	Earthquake	Ground movement	NaN	NaN	Guatemala	GTM	Central America	Americas	Quezaltenango, San Marcos	NaN	Tsunami/Tidal wave	NaN	NaN	NaN	NaN	NaN	8.0	Richter	14	-91	20:20	NaN	1902	4.0	18.0	1902	4.0	18.0	2000.0	NaN	NaN	NaN	NaN	NaN	25000.0	3.350513	NaN	NaN	NaN	NaN
3	1902	3	NaN	Natural	Geophysical	Volcanic activity	Ash fall	NaN	Santa Maria	Guatemala	GTM	Central America	Americas	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	1902	4.0	8.0	1902	4.0	8.0	1000.0	NaN	NaN	NaN	NaN	NaN	NaN	3.350513	NaN	NaN	NaN	NaN
4	1902	10	NaN	Natural	Geophysical	Volcanic activity	Ash fall	NaN	Santa Maria	Guatemala	GTM	Central America	Americas	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	1902	10.0	24.0	1902	10.0	24.0	6000.0	NaN	NaN	NaN	NaN	NaN	NaN	3.350513	NaN	NaN	NaN	NaN

	Dis No	Year	Seq	Glide	Disaster Group	Disaster Subgroup	Disaster Type	Disaster Subtype	Disaster Subsubtype	Event Name	Country	ISO	Region	Continent	Location	Origin	Associated Dis	Associated Dis2	OFDA Response	Appeal	Declaration	Aid Contribution	Dis Mag Value	Dis Mag Scale	Latitude	Longitude	Local Time	River Basin	Start Year	Start Month	Start Day	End Year	End Month	End Day	Total Deaths	No Injured	No Affected	No Homeless	Total Affected	Reconstruction Costs ('000 US$)	Insured Damages ('000 US$)	Total Damages ('000 US$)	CPI	Adm Level	Admin1 Code	Admin2 Code	Geo Locations
0	1970-0013-ARG	1970	13	NaN	Natural	Hydrological	Flood	NaN	NaN	NaN	Argentina	ARG	South America	Americas	Mendoza	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	Km2	NaN	NaN	NaN	NaN	1970	1.0	4.0	1970	1.0	4.0	36.0	NaN	NaN	NaN	NaN	NaN	NaN	25000.0	15.001282	NaN	NaN	NaN	NaN
1	1970-0109-AUS	1970	109	NaN	Natural	Meteorological	Storm	Tropical cyclone	NaN	Ada	Australia	AUS	Australia and New Zealand	Oceania	Queensland	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	Kph	NaN	NaN	NaN	NaN	1970	1.0	NaN	1970	1.0	NaN	13.0	NaN	NaN	NaN	NaN	NaN	NaN	72475.0	15.001282	NaN	NaN	NaN	NaN
2	1970-0044-BEN	1970	44	NaN	Natural	Hydrological	Flood	NaN	NaN	NaN	Benin	BEN	Western Africa	Africa	Atacora region	NaN	NaN	NaN	Yes	NaN	NaN	NaN	NaN	Km2	NaN	NaN	NaN	NaN	1970	9.0	NaN	1970	9.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	200.0	15.001282	NaN	NaN	NaN	NaN
3	1970-0063-BGD	1970	63	NaN	Natural	Meteorological	Storm	Tropical cyclone	NaN	NaN	Bangladesh	BGD	Southern Asia	Asia	Khulna, Chittagong	NaN	NaN	NaN	Yes	NaN	NaN	NaN	NaN	Kph	NaN	NaN	NaN	NaN	1970	11.0	12.0	1970	11.0	12.0	300000.0	NaN	3648000.0	NaN	3648000.0	NaN	NaN	86400.0	15.001282	NaN	NaN	NaN	NaN
4	1970-0026-BGD	1970	26	NaN	Natural	Meteorological	Storm	NaN	NaN	NaN	Bangladesh	BGD	Southern Asia	Asia	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	Kph	NaN	NaN	NaN	NaN	1970	4.0	13.0	1970	4.0	13.0	17.0	NaN	110.0	NaN	110.0	NaN	NaN	NaN	15.001282	NaN	NaN	NaN	NaN

	End Year	Location	Admin1 Code	Disaster Group	ISO	Event Name	Country	Aid Contribution	End Day	OFDA Response	Dis Mag Value	Start Year	Start Day	Total Damages ('000 US$)	Disaster Subgroup	Admin2 Code	Region	Start Month	Latitude	Continent	Glide	Appeal	Associated Dis	No Affected	Seq	Insured Damages ('000 US$)	Disaster Subsubtype	Geo Locations	Dis Mag Scale	Adm Level	Year	Disaster Subtype	CPI	Total Deaths	Total Affected	Longitude	Disaster Type	Declaration	No Injured	Associated Dis2	Local Time	River Basin	Origin	End Month	No Homeless
0	1900	Galveston (Texas)	NaN	Natural	USA	NaN	United States of America (the)	NaN	8.0	NaN	220.0	1900	8.0	30000.0	Meteorological	NaN	Northern America	9.0	NaN	Americas	NaN	NaN	Avalanche (Snow, Debris)	NaN	3	NaN	NaN	NaN	Kph	NaN	1900	Tropical cyclone	3.221647	6000.0	NaN	NaN	Storm	NaN	NaN	NaN	NaN	NaN	NaN	9.0	NaN
1	1900	Saint James	NaN	Natural	JAM	NaN	Jamaica	NaN	6.0	NaN	NaN	1900	6.0	NaN	Hydrological	NaN	Caribbean	1.0	NaN	Americas	NaN	NaN	NaN	NaN	6	NaN	NaN	NaN	Km2	NaN	1900	NaN	3.221647	300.0	NaN	NaN	Flood	NaN	NaN	NaN	NaN	NaN	NaN	1.0	NaN
2	1900	Porus	NaN	Natural	JAM	Gastroenteritis	Jamaica	NaN	13.0	NaN	NaN	1900	13.0	NaN	Biological	NaN	Caribbean	1.0	33	Americas	NaN	NaN	NaN	NaN	7	NaN	NaN	NaN	Vaccinated	NaN	1900	Viral disease	3.221647	30.0	NaN	NaN	Epidemic	NaN	NaN	NaN	NaN	NaN	NaN	1.0	NaN
3	1900	NaN	NaN	Natural	JPN	NaN	Japan	NaN	7.0	NaN	NaN	1900	7.0	NaN	Geophysical	NaN	Eastern Asia	7.0	NaN	Asia	NaN	NaN	NaN	NaN	8	NaN	NaN	NaN	NaN	NaN	1900	Ash fall	3.221647	30.0	NaN	NaN	Volcanic activity	NaN	NaN	NaN	NaN	NaN	NaN	7.0	NaN
4	1900	KARS,KARAKURT,KAGIZMAN,DIGOR	NaN	Natural	TUR	NaN	Turkey	NaN	12.0	NaN	6.0	1900	12.0	NaN	Geophysical	NaN	Western Asia	7.0	40.3	Asia	NaN	NaN	NaN	NaN	9	NaN	NaN	NaN	Richter	NaN	1900	Ground movement	3.221647	140.0	NaN	43.1	Earthquake	NaN	NaN	NaN	09:25	NaN	NaN	7.0	NaN

Natural Disasters Analysis - Yearly Trend

Authors: Women of the West Coast (WWC)

Date: Oct. 22, 2023

Table of Contents

Introduction

Data Processing

Dataset 1:

Dataset 2:

Combine datasets

To effectively combine and analyze the datasets, we'll need to:

Here's a summary of the columns in the two datasets:

Data Cleaning

Checking for Missing Values

Decision for Dealing with Missing Values

Global Analysis

Canada-Specific Analysis

Graph Interpretations:

Graph Interpretations:

Graph Interpretations:

Insights Summary

Conclusion