1、R语言数据分析报告:美国天气事件对人员伤亡和经济损失的影响(附代码数据) R语言数据分析报告:美国天气事件对人员伤亡和经济损失的影响 概要:本分析旨在回答两个问题:1)在美国各地,哪类事件对人群健康危害最大;2)在整个美国,哪类事件造成的经济后果最严重?为了回答这些问题,我们使用美国国家气象局于1950年至2011年间在美国所有州收集的数据进行了分析。分析围绕两个主要维度展开:时间和地理(州一级)。以这两个维度为轴,按事件类型汇总了三项指标来衡量事件的影响,即:a)人员伤亡;b)财产损失;c)作物损失。这些结果提供的见解,可能有助于地方官员采取预防措施,以减少在其辖区内盛行
2、的天气事件的影响。 数据处理 # - Setup - # library(dplyr) library(ggplot2) library(lubridate) library(knitr) # - Constants Definition - # RECENCY_SPAN_IN_YEARS <- 10 # Last X years Top Events by frequency, by geographic area C_NOT_DEFINED_STR <- "NOT DEFINED" C_NOT_DEFINED_INT <- -1 # - 阶段1:加载源数据 - # setwd("/Users/prosales/Documen
3、ts/Capacitaciones/Certificaciones/Coursera DS Certificate - Course 5 - Reproducible Research/Final Project/") natural_events_df <- read.csv("repdata%2Fdata%2FStormData.csv.bz2") state_geocodes_df <- read.csv("state-geocodes-v2015.csv") # - 阶段2:数据准备:增强和重组 - # regions_df <- state_geocodes_df %>% filter(division == 0 & state_fips == 0) %>% s
4、elect(region, name) colnames(regions_df) <- c("region_id", "region_name") divisions_df <- state_geocodes_df %>% filter(division != 0 & state_fips == 0) %>% select(division, name) colnames(divisions_df) <- c("division_id", "division_name") states_df <- state_geocodes_df %>% filter(state_fips != 0) %>% select(region, division, state_fips, name) colnames(states_df) <-
5、 c("region_id", "division_id", "state_id", "state_name") complete_geography_df <- merge(states_df, regions_df, by = "region_id") complete_geography_df <- merge(complete_geography_df, divisions_df, by = "division_id") complete_geography_df <- complete_geography_df %>% select(region_id, region_name, division_id, division_name, state_id, state_n
6、ame) geography_structured_events_df <- merge(natural_events_df, complete_geography_df, by.x = "STATE__", by.y = "state_id", all.x = TRUE) geography_structured_events_df <- geography_structured_events_df %>% mutate(region_name = as.character(region_name)) geography_structured_events_df <- geography_structured_events_df %>% mutate(division_name = as.character(division_name)) geograph
7、y_structured_events_df <- geography_structured_events_df %>% mutate(state_name = as.character(state_name)) geography_structured_events_df <- geography_structured_events_df %>% mutate(region_name = replace(region_name, is.na(region_name), C_NOT_DEFINED_STR)) geography_structured_events_df <- geography_structured_events_df %>% mutate(division_name = replace(division_name, is.na(division_name), C_NOT_DEFINED_STR
8、)) geography_structured_events_df <- geography_structured_events_df %>% mutate(state_name = replace(state_name, is.na(state_name), C_NOT_DEFINED_STR)) geography_structured_events_df <- geography_structured_events_df %>% mutate(region_id = replace(region_id, is.na(region_id), C_NOT_DEFINED_INT)) geography_structured_events_df <- geography_structured_events_df %>% mutate(division_id = replace(division_id, is.na(d
9、ivision_id), C_NOT_DEFINED_INT)) geography_structured_events_df <- geography_structured_events_df %>% mutate(BGN_DATE = as.Date(BGN_DATE, format = "%m/%d/%Y")) # - 第三阶段:按地理区域划分的频率,历史最高事件 - # events_frequency_by_geography_df <- geography_structured_events_df %>% count(region_name, state_name, EVTYPE) top_events_by_geography_df <- events_frequency_by_geography_df %>% group_by(region_name, state_name) %>% mutate(my_rank =
10、rank(desc(n) % filter(my_rank = 3)top_events_by_geography_df - top_events_by_geography_dfwith(top_events_by_geography_df, order(region_name, state_name, my_rank), max_dates_by_geography_df % filter(!is.na(BGN_DATE) % filter(is.Date(BGN_DATE) % group_by(region_name, state_name) % summarise(max_date =
11、 max(BGN_DATE) % mutate(event_date_lower_bound = max_date - years(RECENCY_SPAN_IN_YEARS)last_X_years_events_by_geography_df % filter(BGN_DATE event_date_lower_bound) % select(region_name, state_name, EVTYPE, BGN_DATE, event_date_lower_bound)last_X_years_events_frequency_by_geography_df % count(regio
12、n_name, state_name, EVTYPE)top_events_in_last_X_years_events_frequency_by_geography_df % group_by(region_name, state_name) % mutate(my_rank = rank(desc(n) % filter(my_rank = 3)top_events_in_last_X_years_events_frequency_by_geography_df - top_events_in_last_X_years_events_frequency_by_geography_dfwit
13、h(top_events_in_last_X_years_events_frequency_by_geography_df, order(region_name, state_name, my_rank), # -第四阶段:地理区域致命事件- -fatalities_by_event_type_by_geography_df % filter(!is.na(FATALITIES) % group_by(region_name, state_name, EVTYPE) % summarise(sum(FATALITIES)colnames(fatalities_by_event_type_by_
14、geography_df) - c(region_name, state_name, EVTYPE, fatalities_sum)top_deadliest_events_types_by_geography_df % mutate(my_rank = rank(desc(fatalities_sum) % filter(my_rank = 3)top_deadliest_events_types_by_geography_df - top_deadliest_events_types_by_geography_dfwith(top_deadliest_events_types_by_geo
15、graphy_df, order(region_name, state_name, my_rank), # -阶段5:按地理区域造成大部分财产损失的事件类型- - Stage 5: Events types that cause most property losses by geographic area - #property_losses_by_event_type_by_geography_df % filter(!is.na(PROPDMG) % group_by(region_name, state_name, EVTYPE) % summarise(sum(PROPDMG)col
16、names(property_losses_by_event_type_by_geography_df) - c(region_name, state_name, EVTYPE, property_losses_sum)top_property_costly_events_types_by_geography_df % mutate(my_rank = rank(desc(property_losses_sum) % filter(my_rank = 3)top_property_costly_events_types_by_geography_df - top_property_costly
17、_events_types_by_geography_dfwith(top_property_costly_events_types_by_geography_df, order(region_name, state_name, my_rank), top_property_costly_events_types_by_geography_df % mutate(property_losses_sum = property_losses_sum / 1000)# - Stage 6: Events types that cause most crop losses by geographic
18、area - #crop_losses_by_event_type_by_geography_df % filter(!is.na(CROPDMG) % group_by(region_name, state_name, EVTYPE) % summarise(sum(CROPDMG)colnames(crop_losses_by_event_type_by_geography_df) - c(region_name, state_name, EVTYPE, crop_losses_sum)top_crop_costly_events_types_by_geography_df % mutat
19、e(my_rank = rank(desc(crop_losses_sum) % filter(my_rank = 3)top_crop_costly_events_types_by_geography_df - top_crop_costly_events_types_by_geography_dfwith(top_crop_costly_events_types_by_geography_df, order(region_name, state_name, my_rank), top_crop_costly_events_types_by_geography_df % mutate(cro
20、p_losses_sum = crop_losses_sum / 1000)# - Stage 7: Events occurence by geographic area by month, during the last X years recorded - #last_X_years_events_frequency_by_month_by_geography_df % count(region_name, state_name, EVTYPE, event_month)top_events_in_last_X_years_by_month_by_geography_df - merge
21、(last_X_years_events_frequency_by_month_by_geography_df, top_events_in_last_X_years_events_frequency_by_geography_df, by.x = c(state_name, EVTYPE), by.y = c(state_name, EVTYPE) )top_events_in_last_X_years_by_month_by_geography_df % select(region_name.y, state_name, EVTYPE, event_month, n.x)colnames(
22、top_events_in_last_X_years_by_month_by_geography_df) - c(region_name, state_name, EVTYPE, event_month, n)# - Stage 8: Deadliest events by geographic area by month - #fatalities_by_geography_by_event_type_by_month_df % mutate(event_month = month(BGN_DATE) % filter(!is.na(FATALITIES) % group_by(region
23、_name, state_name, EVTYPE, event_month) % summarise(sum(FATALITIES)colnames(fatalities_by_geography_by_event_type_by_month_df) - c(region_name, state_name, EVTYPE, event_month, fatalities_sum)top_fatalities_by_geography_by_event_type_by_month_df - merge(fatalities_by_geography_by_event_type_by_month
24、_df, top_deadliest_events_types_by_geography_df, by.x = c(state_name, EVTYPE), by.y = c(state_name, EVTYPE)top_fatalities_by_geography_by_event_type_by_month_df % select(region_name.x, state_name, EVTYPE, event_month, fatalities_sum.x)colnames(top_fatalities_by_geography_by_event_type_by_month_df) -
25、 c(region_name, state_name, EVTYPE, event_month,fatalities_sum)# - Stage 9: Events types that cause most PROPERTY losses by geographic area, by month - #property_losses_by_geography_by_event_type_by_month_df % mutate(event_month = month(BGN_DATE) % filter(!is.na(PROPDMG) % group_by(region_name, state_name, EVTYPE, event_month) % summarise(sum(PROPDMG)colnames(property_losses_by_geography_by_event_type_by_month_df) - c(region_name
copyright@ 2008-2022 冰豆网网站版权所有
经营许可证编号:鄂ICP备2022015515号-1