R语言数据分析报告美国天气事件对人员伤亡和经济损失的影响 附代码数据.docx
《R语言数据分析报告美国天气事件对人员伤亡和经济损失的影响 附代码数据.docx》由会员分享,可在线阅读,更多相关《R语言数据分析报告美国天气事件对人员伤亡和经济损失的影响 附代码数据.docx(15页珍藏版)》请在冰豆网上搜索。
![R语言数据分析报告美国天气事件对人员伤亡和经济损失的影响 附代码数据.docx](https://file1.bdocx.com/fileroot1/2023-1/6/13ee657b-7405-4c9b-989c-bed81938733a/13ee657b-7405-4c9b-989c-bed81938733a1.gif)
R语言数据分析报告美国天气事件对人员伤亡和经济损失的影响附代码数据
R语言数据分析报告:
美国天气事件对人员伤亡和经济损失的影响
概要
这个分析的重点是回答两个问题:
1)在美国各地,哪类事件对人群健康危害最大;2)在整个美国,哪类事件具有最大的经济后果?
为了回答这些问题,本文使用美国国家气象局于1950年至2011年间在美国各州收集的数据进行了分析。
分析围绕两个主要维度展开:时间和地理(州一级)。
以这两个维度为轴,按事件类型汇总了三项指标来衡量事件的影响,即:a)人员伤亡;b)财产损失;c)作物损失。
这些结果所提供的见解,可能有助于地方官员采取预防措施,以减轻其辖区内常见天气事件的影响。
数据处理
#----------------------Setup------------------------#
library(dplyr)
library(ggplot2)
library(lubridate)
library(knitr)
# ---- Constants definition ----
# Placeholder values substituted for missing geography names / ids.
C_NOT_DEFINED_INT <- -1
C_NOT_DEFINED_STR <- 'NOTDEFINED'
# Last X years: width of the window used for "top events by frequency, by
# geographic area".
RECENCY_SPAN_IN_YEARS <- 10
# ---- Stage 1: Load source data ----
# Input files are resolved against a configurable directory instead of a
# setwd() call to one user's home folder, so the script can run on any machine
# and does not change the session's working directory. Point the
# STORM_DATA_DIR environment variable at the folder holding both files; when it
# is unset the current working directory is used.
data_dir <- Sys.getenv('STORM_DATA_DIR', unset = '.')
natural_events_path <- file.path(data_dir, 'repdata%2Fdata%2FStormData.csv.bz2')
state_geocodes_path <- file.path(data_dir, 'state-geocodes-v2015.csv')

# Fail early with a readable message rather than a low-level connection error.
input_paths <- c(natural_events_path, state_geocodes_path)
missing_inputs <- input_paths[!file.exists(input_paths)]
if (length(missing_inputs) > 0) {
  stop(
    'Input file(s) not found: ', paste(missing_inputs, collapse = ', '),
    call. = FALSE
  )
}

natural_events_df <- read.csv(natural_events_path)
state_geocodes_df <- read.csv(state_geocodes_path)
# ---- Stage 2: Data preparation: enrichment and restructuring ----
# Split the geocodes table into three lookups, told apart by which codes are 0:
#   regions   : division == 0 and state_fips == 0
#   divisions : division != 0 and state_fips == 0
#   states    : state_fips != 0
regions_df <- state_geocodes_df %>%
  filter(division == 0 & state_fips == 0) %>%
  select(region, name)
colnames(regions_df) <- c('region_id', 'region_name')

divisions_df <- state_geocodes_df %>%
  filter(division != 0 & state_fips == 0) %>%
  select(division, name)
colnames(divisions_df) <- c('division_id', 'division_name')

states_df <- state_geocodes_df %>%
  filter(state_fips != 0) %>%
  select(region, division, state_fips, name)
colnames(states_df) <- c('region_id', 'division_id', 'state_id', 'state_name')

# One row per state, carrying the names of its region and division.
complete_geography_df <- merge(states_df, regions_df, by = 'region_id')
complete_geography_df <- merge(
  complete_geography_df,
  divisions_df,
  by = 'division_id'
)
complete_geography_df <- complete_geography_df %>%
  select(
    region_id, region_name,
    division_id, division_name,
    state_id, state_name
  )

# Left join (all.x = TRUE): events whose STATE__ code has no match in the
# geography table are kept, with NA in the geography columns.
geography_structured_events_df <- merge(
  natural_events_df,
  complete_geography_df,
  by.x = 'STATE__',
  by.y = 'state_id',
  all.x = TRUE
)

geography_structured_events_df <- geography_structured_events_df %>%
  # Names are converted to character first so that the placeholder string can
  # be written into them even when they were read as factors.
  mutate(
    region_name = as.character(region_name),
    division_name = as.character(division_name),
    state_name = as.character(state_name)
  ) %>%
  # Replace the NAs produced by unmatched events with explicit placeholders,
  # then parse the event start date.
  mutate(
    region_name = replace(region_name, is.na(region_name), C_NOT_DEFINED_STR),
    division_name = replace(
      division_name, is.na(division_name), C_NOT_DEFINED_STR
    ),
    state_name = replace(state_name, is.na(state_name), C_NOT_DEFINED_STR),
    region_id = replace(region_id, is.na(region_id), C_NOT_DEFINED_INT),
    division_id = replace(division_id, is.na(division_id), C_NOT_DEFINED_INT),
    BGN_DATE = as.Date(BGN_DATE, format = '%m/%d/%Y')
  )
# ---- Stage 3: Most frequent event types by geographic area ----
# Computed twice: over the whole history, and over the last
# RECENCY_SPAN_IN_YEARS years recorded for each state.

# -- Whole history --
events_frequency_by_geography_df <- geography_structured_events_df %>%
  count(region_name, state_name, EVTYPE)

# Keep the event types ranked 1-3 by count within each region/state.
# NOTE(review): rank() averages tied ranks, so a tie around third place can
# yield fewer than three rows for a state - confirm this is intended.
top_events_by_geography_df <- events_frequency_by_geography_df %>%
  group_by(region_name, state_name) %>%
  mutate(my_rank = rank(desc(n))) %>%
  filter(my_rank <= 3)
top_events_by_geography_df <- top_events_by_geography_df[
  with(top_events_by_geography_df, order(region_name, state_name, my_rank)),
]

# -- Last X years --
# Most recent event date per state and the start of the recency window.
# `%m-%` is used instead of `-` because subtracting a years() period from
# Feb 29 yields NA when the target year is not a leap year, which would drop
# every event of that state in the filter below; `%m-%` rolls back to the last
# valid day of the month instead.
max_dates_by_geography_df <- geography_structured_events_df %>%
  filter(!is.na(BGN_DATE)) %>%
  # is.Date() tests the column as a whole, so this keeps all rows or none.
  filter(is.Date(BGN_DATE)) %>%
  group_by(region_name, state_name) %>%
  summarise(max_date = max(BGN_DATE)) %>%
  mutate(event_date_lower_bound = max_date %m-% years(RECENCY_SPAN_IN_YEARS))

# Events that fall inside each state's recency window.
last_X_years_events_by_geography_df <- merge(
  geography_structured_events_df,
  max_dates_by_geography_df,
  by = c('region_name', 'state_name')
) %>%
  filter(BGN_DATE > event_date_lower_bound) %>%
  select(region_name, state_name, EVTYPE, BGN_DATE, event_date_lower_bound)

last_X_years_events_frequency_by_geography_df <-
  last_X_years_events_by_geography_df %>%
  count(region_name, state_name, EVTYPE)

top_events_in_last_X_years_events_frequency_by_geography_df <-
  last_X_years_events_frequency_by_geography_df %>%
  group_by(region_name, state_name) %>%
  mutate(my_rank = rank(desc(n))) %>%
  filter(my_rank <= 3)
top_events_in_last_X_years_events_frequency_by_geography_df <-
  top_events_in_last_X_years_events_frequency_by_geography_df[
    with(
      top_events_in_last_X_years_events_frequency_by_geography_df,
      order(region_name, state_name, my_rank)
    ),
  ]
# ---- Stage 4: Deadliest event types by geographic area ----
# Total fatalities per region/state/event type (rows with NA fatalities are
# excluded before summing).
fatalities_by_event_type_by_geography_df <- geography_structured_events_df %>%
  filter(!is.na(FATALITIES)) %>%
  group_by(region_name, state_name, EVTYPE) %>%
  summarise(fatalities_sum = sum(FATALITIES))

# Rank within each region/state and keep ranks 1-3. The grouping is stated
# explicitly rather than relying on the groups summarise() leaves behind.
top_deadliest_events_types_by_geography_df <-
  fatalities_by_event_type_by_geography_df %>%
  group_by(region_name, state_name) %>%
  mutate(my_rank = rank(desc(fatalities_sum))) %>%
  filter(my_rank <= 3)
top_deadliest_events_types_by_geography_df <-
  top_deadliest_events_types_by_geography_df[
    with(
      top_deadliest_events_types_by_geography_df,
      order(region_name, state_name, my_rank)
    ),
  ]
# ---- Stage 5: Event types that cause most property losses by geographic
# area ----
# Sum of PROPDMG per region/state/event type (rows with NA are excluded).
# NOTE(review): PROPDMG is summed as-is. If the dataset carries a separate
# magnitude column for this figure (e.g. PROPDMGEXP in NOAA Storm Data), the
# raw values are not on a common scale - confirm before relying on the totals.
property_losses_by_event_type_by_geography_df <-
  geography_structured_events_df %>%
  filter(!is.na(PROPDMG)) %>%
  group_by(region_name, state_name, EVTYPE) %>%
  summarise(property_losses_sum = sum(PROPDMG))

# Rank within each region/state and keep ranks 1-3. The grouping is stated
# explicitly rather than relying on the groups summarise() leaves behind.
top_property_costly_events_types_by_geography_df <-
  property_losses_by_event_type_by_geography_df %>%
  group_by(region_name, state_name) %>%
  mutate(my_rank = rank(desc(property_losses_sum))) %>%
  filter(my_rank <= 3)
top_property_costly_events_types_by_geography_df <-
  top_property_costly_events_types_by_geography_df[
    with(
      top_property_costly_events_types_by_geography_df,
      order(region_name, state_name, my_rank)
    ),
  ]

# Rescale the totals by a factor of 1/1000 for reporting.
top_property_costly_events_types_by_geography_df <-
  top_property_costly_events_types_by_geography_df %>%
  mutate(property_losses_sum = property_losses_sum / 1000)
# ---- Stage 6: Event types that cause most crop losses by geographic area ----
# Sum of CROPDMG per region/state/event type (rows with NA are excluded).
# NOTE(review): CROPDMG is summed as-is. If the dataset carries a separate
# magnitude column for this figure (e.g. CROPDMGEXP in NOAA Storm Data), the
# raw values are not on a common scale - confirm before relying on the totals.
crop_losses_by_event_type_by_geography_df <- geography_structured_events_df %>%
  filter(!is.na(CROPDMG)) %>%
  group_by(region_name, state_name, EVTYPE) %>%
  summarise(crop_losses_sum = sum(CROPDMG))

# Rank within each region/state and keep ranks 1-3. The grouping is stated
# explicitly rather than relying on the groups summarise() leaves behind.
top_crop_costly_events_types_by_geography_df <-
  crop_losses_by_event_type_by_geography_df %>%
  group_by(region_name, state_name) %>%
  mutate(my_rank = rank(desc(crop_losses_sum))) %>%
  filter(my_rank <= 3)
top_crop_costly_events_types_by_geography_df <-
  top_crop_costly_events_types_by_geography_df[
    with(
      top_crop_costly_events_types_by_geography_df,
      order(region_name, state_name, my_rank)
    ),
  ]

# Rescale the totals by a factor of 1/1000 for reporting.
top_crop_costly_events_types_by_geography_df <-
  top_crop_costly_events_types_by_geography_df %>%
  mutate(crop_losses_sum = crop_losses_sum / 1000)
# ---- Stage 7: Event occurrence by geographic area by month, during the last
# X years recorded ----
# Count of events per region/state/event type/calendar month inside the
# recency window.
last_X_years_events_frequency_by_month_by_geography_df <-
  last_X_years_events_by_geography_df %>%
  mutate(event_month = month(BGN_DATE)) %>%
  count(region_name, state_name, EVTYPE, event_month)

# Inner join on state and event type: only the monthly counts of each state's
# top-ranked recent event types survive. Both inputs carry region_name and n,
# so merge() suffixes them; the region name is taken from the top-events table
# (.y) and the monthly count from the monthly table (.x).
top_events_in_last_X_years_by_month_by_geography_df <- merge(
  last_X_years_events_frequency_by_month_by_geography_df,
  top_events_in_last_X_years_events_frequency_by_geography_df,
  by = c('state_name', 'EVTYPE')
) %>%
  select(
    region_name = region_name.y,
    state_name,
    EVTYPE,
    event_month,
    n = n.x
  )
# ---- Stage 8: Deadliest events by geographic area by month ----
# Total fatalities per region/state/event type/calendar month (rows with NA
# fatalities are excluded before summing).
fatalities_by_geography_by_event_type_by_month_df <-
  geography_structured_events_df %>%
  mutate(event_month = month(BGN_DATE)) %>%
  filter(!is.na(FATALITIES)) %>%
  group_by(region_name, state_name, EVTYPE, event_month) %>%
  summarise(fatalities_sum = sum(FATALITIES))

# Inner join on state and event type: only the monthly totals of each state's
# deadliest event types survive. Both inputs carry region_name and
# fatalities_sum, so merge() suffixes them; the monthly table's values (.x)
# are the ones kept.
top_fatalities_by_geography_by_event_type_by_month_df <- merge(
  fatalities_by_geography_by_event_type_by_month_df,
  top_deadliest_events_types_by_geography_df,
  by = c('state_name', 'EVTYPE')
) %>%
  select(
    region_name = region_name.x,
    state_name,
    EVTYPE,
    event_month,
    fatalities_sum = fatalities_sum.x
  )
#----------------------Stage9:
EventstypesthatcausemostPROPERTYlossesbygeographicarea,bymonth----------------------#
property_losses_by_geography_by_event_type_by_month_df<-geography_structured_events_df%>%mutate(event_month=month(BGN_DATE))%>%filter(!
is.na(PROPDMG))%>%group_by(region_name,state_name,EVTYPE,event_month)%>%summarise(sum(PROPDMG))
colnames(property_losses_by_geography_by_event_type_by_month_df)<-c('region_name