First, you’ll download and set up R and RStudio, a free integrated development environment for R. RStudio helps you work in R by providing a coding platform with access to CRAN, the Comprehensive R Archive Network, which contains thousands of R libraries, a built-in viewer for charts and graphs, and other useful features.
If necessary, download R 3.4.0 or later. Accept all defaults in the installation wizard.
If necessary, download RStudio Desktop. Accept all defaults in the installation wizard.
if(!require(readr)) install.packages("readr")
if(!require(dplyr)) install.packages("dplyr")
if(!require(DT)) install.packages("DT")
if(!require(ggrepel)) install.packages("ggrepel")
if(!require(leaflet)) install.packages("leaflet")
Load the data using readr and read_csv().
library(readr)
# path <- "http://spatial.binghamton.edu/projects/crime/data/SF_Crime_2007_2016.csv"
path <- "D:\\Data\\Crime\\SF_Crime_2007_2016.csv"
df <- read_csv(path)
Display the data using DT and datatable().
library(DT)
df_sub <- df[1:100,] # display the first 100 rows
df_sub$Time <- as.character(df_sub$Time)
datatable(df_sub, options = list(pageLength = 5,scrollX='400px'))
sprintf("Number of Rows in Dataframe: %s", format(nrow(df),big.mark = ","))
## [1] "Number of Rows in Dataframe: 1,435,794"
The All-Caps text is difficult to read. Let’s force the text in the appropriate columns into proper case.
# str(df)
proper_case <- function(x) {
return (gsub("\\b([A-Z])([A-Z]+)", "\\U\\1\\L\\2" , x, perl=TRUE))
}
library(dplyr)
df <- df %>% mutate(Category = proper_case(Category),
Descript = proper_case(Descript),
PdDistrict = proper_case(PdDistrict),
Resolution = proper_case(Resolution),
Time = as.character(Time))
df_sub <- df[1:100,] # display the first 100 rows
datatable(df_sub, options = list(pageLength = 5,scrollX='400px'))
Display crime incident locations on the map using leaflet. Click icons on the map to show incident details.
library(leaflet)
data <- df[1:10000,] # display the first 10,000 rows
data$popup <- paste("<b>Incident #: </b>", data$IncidntNum, "<br>", "<b>Category: </b>", data$Category,
"<br>", "<b>Description: </b>", data$Descript,
"<br>", "<b>Day of week: </b>", data$DayOfWeek,
"<br>", "<b>Date: </b>", data$Date,
"<br>", "<b>Time: </b>", data$Time,
"<br>", "<b>PD district: </b>", data$PdDistrict,
"<br>", "<b>Resolution: </b>", data$Resolution,
"<br>", "<b>Address: </b>", data$Address,
"<br>", "<b>Longitude: </b>", data$X,
"<br>", "<b>Latitude: </b>", data$Y)
leaflet(data, width = "100%") %>% addTiles() %>%
addTiles(group = "OSM (default)") %>%
addProviderTiles(provider = "Esri.WorldStreetMap",group = "World StreetMap") %>%
addProviderTiles(provider = "Esri.WorldImagery",group = "World Imagery") %>%
# addProviderTiles(provider = "NASAGIBS.ViirsEarthAtNight2012",group = "Nighttime Imagery") %>%
addMarkers(lng = ~X, lat = ~Y, popup = data$popup, clusterOptions = markerClusterOptions()) %>%
addLayersControl(
baseGroups = c("OSM (default)","World StreetMap", "World Imagery"),
options = layersControlOptions(collapsed = FALSE)
)
library(dplyr)
df_crime_daily <- df %>%
mutate(Date = as.Date(Date, "%m/%d/%Y")) %>%
group_by(Date) %>%
summarize(count = n()) %>%
arrange(Date)
library(ggplot2)
library(scales)
plot <- ggplot(df_crime_daily, aes(x = Date, y = count)) +
geom_line(color = "#F2CA27", size = 0.1) +
geom_smooth(color = "#1A1A1A") +
# fte_theme() +
scale_x_date(breaks = date_breaks("1 year"), labels = date_format("%Y")) +
labs(x = "Date of Crime", y = "Number of Crimes", title = "Daily Crimes in San Francisco from 2007 – 2016")
plot
Summarize the data by incident category.
df_category <- sort(table(df$Category),decreasing = TRUE)
df_category <- data.frame(df_category[df_category > 10000])
colnames(df_category) <- c("Category", "Frequency")
df_category$Percentage <- df_category$Frequency / sum(df_category$Frequency)
datatable(df_category, options = list(scrollX='400px'))
Create a bar plot based on the incident category.
library(ggplot2)
library(ggrepel)
bp<-ggplot(df_category, aes(x=Category, y=Frequency, fill=Category)) + geom_bar(stat="identity") +
theme(axis.text.x=element_blank()) + geom_text_repel(data=df_category, aes(label=Category))
bp
Create a pie chart based on the incident category.
bp<-ggplot(df_category, aes(x="", y=Percentage, fill=Category)) + geom_bar(stat="identity")
pie <- bp + coord_polar("y")
pie
Create a chart of crimes (Larceny/Theft) over time.
df_theft <- df %>% filter(grepl("Larceny/Theft", Category))
df_theft_daily <- df_theft %>%
mutate(Date = as.Date(Date, "%m/%d/%Y")) %>%
group_by(Date) %>%
summarize(count = n()) %>%
arrange(Date)
library(ggplot2)
library(scales)
plot <- ggplot(df_theft_daily, aes(x = Date, y = count)) +
geom_line(color = "#F2CA27", size = 0.1) +
geom_smooth(color = "#1A1A1A") +
# fte_theme() +
scale_x_date(breaks = date_breaks("1 year"), labels = date_format("%Y")) +
labs(x = "Date of Theft", y = "Number of Thefts", title = "Daily Thefts in San Francisco from 2007 – 2016")
plot
Aggregate counts of thefts by Day-of-Week and Time to create heat map. Fortunately, the Day-Of-Week part is pre-derived, but Hour is slightly harder.
get_hour <- function(x) {
return (as.numeric(strsplit(x,":")[[1]][1]))
}
df_theft_time <- df_theft %>%
mutate(Hour = sapply(Time, get_hour)) %>%
group_by(DayOfWeek, Hour) %>%
summarize(count = n())
# df_theft_time %>% head(10)
datatable(df_theft_time, options = list(scrollX='400px'))
Reorder and format Factors.
dow_format <- c("Sunday","Monday","Tuesday","Wednesday","Thursday","Friday","Saturday")
hour_format <- c(paste(c(12,1:11),"AM"), paste(c(12,1:11),"PM"))
df_theft_time$DayOfWeek <- factor(df_theft_time$DayOfWeek, level = rev(dow_format))
df_theft_time$Hour <- factor(df_theft_time$Hour, level = 0:23, label = hour_format)
# df_theft_time %>% head(10)
datatable(df_theft_time, options = list(scrollX='400px'))
Create Time Heatmap
plot <- ggplot(df_theft_time, aes(x = Hour, y = DayOfWeek, fill = count)) +
geom_tile() +
# fte_theme() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.6), legend.title = element_blank(), legend.position="top", legend.direction="horizontal", legend.key.width=unit(2, "cm"), legend.key.height=unit(0.25, "cm"), legend.margin=unit(-0.5,"cm"), panel.margin=element_blank()) +
labs(x = "Hour of Theft (Local Time)", y = "Day of Week of Theft", title = "Number of Thefts in San Francisco from 2007 – 2016, by Time of Theft") +
scale_fill_gradient(low = "white", high = "#27AE60", labels = comma)
plot
Hmm, why is there a surge at 6-7PM on weekdays?
Create a chart of arrests over time.
df_arrest <- df %>% filter(grepl("Arrest", Resolution))
df_arrest_daily <- df_arrest %>%
mutate(Date = as.Date(Date, "%m/%d/%Y")) %>%
group_by(Date) %>%
summarize(count = n()) %>%
arrange(Date)
library(ggplot2)
library(scales)
plot <- ggplot(df_arrest_daily, aes(x = Date, y = count)) +
geom_line(color = "#F2CA27", size = 0.1) +
geom_smooth(color = "#1A1A1A") +
# fte_theme() +
scale_x_date(breaks = date_breaks("1 year"), labels = date_format("%Y")) +
labs(x = "Date of Arrest", y = "# of Police Arrests", title = "Daily Police Arrests in San Francisco from 2007 – 2016")
plot
get_hour <- function(x) {
return (as.numeric(strsplit(x,":")[[1]][1]))
}
df_arrest_time <- df_arrest %>%
mutate(Hour = sapply(Time, get_hour)) %>%
group_by(DayOfWeek, Hour) %>%
summarize(count = n())
dow_format <- c("Sunday","Monday","Tuesday","Wednesday","Thursday","Friday","Saturday")
hour_format <- c(paste(c(12,1:11),"AM"), paste(c(12,1:11),"PM"))
df_arrest_time$DayOfWeek <- factor(df_arrest_time$DayOfWeek, level = rev(dow_format))
df_arrest_time$Hour <- factor(df_arrest_time$Hour, level = 0:23, label = hour_format)
plot <- ggplot(df_arrest_time, aes(x = Hour, y = DayOfWeek, fill = count)) +
geom_tile() +
# fte_theme() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.6), legend.title = element_blank(), legend.position="top", legend.direction="horizontal", legend.key.width=unit(2, "cm"), legend.key.height=unit(0.25, "cm"), legend.margin=unit(-0.5,"cm"), panel.margin=element_blank()) +
labs(x = "Hour of Arrest (Local Time)", y = "Day of Week of Arrest", title = "Number of Police Arrests in San Francisco from 2007 – 2016, by Time of Arrest") +
scale_fill_gradient(low = "white", high = "#27AE60", labels = comma)
plot
Hmm, why is there a surge on Wednesday afternoon, and at 4-5PM on all days? Let’s look at subgroups to verify there isn’t a latent factor.
Certain types of crime may be more time dependent. (e.g., more traffic violations when people leave work)
df_top_crimes <- df_arrest %>%
group_by(Category) %>%
summarize(count = n()) %>%
arrange(desc(count))
datatable(df_top_crimes, options = list(pageLength = 10,scrollX='400px'))
df_arrest_time_crime <- df_arrest %>%
filter(Category %in% df_top_crimes$Category[2:19]) %>%
mutate(Hour = sapply(Time, get_hour)) %>%
group_by(Category, DayOfWeek, Hour) %>%
summarize(count = n())
df_arrest_time_crime$DayOfWeek <- factor(df_arrest_time_crime$DayOfWeek, level = rev(dow_format))
df_arrest_time_crime$Hour <- factor(df_arrest_time_crime$Hour, level = 0:23, label = hour_format)
datatable(df_arrest_time_crime, options = list(pageLength = 10,scrollX='400px'))
plot <- ggplot(df_arrest_time_crime, aes(x = Hour, y = DayOfWeek, fill = count)) +
geom_tile() +
# fte_theme() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.6, size = 4)) +
labs(x = "Hour of Arrest (Local Time)", y = "Day of Week of Arrest", title = "Number of Police Arrests in San Francisco from 2007 – 2016, by Category and Time of Arrest") +
scale_fill_gradient(low = "white", high = "#2980B9") +
facet_wrap(~ Category, nrow = 6)
plot
Good, but the gradients aren’t helpful because they are not normalized. We need to normalize the range on each facet. (unfortunately, this makes the value of the gradient unhelpful)
df_arrest_time_crime <- df_arrest_time_crime %>%
group_by(Category) %>%
mutate(norm = count/sum(count))
datatable(df_arrest_time_crime, options = list(pageLength = 10,scrollX='400px'))
plot <- ggplot(df_arrest_time_crime, aes(x = Hour, y = DayOfWeek, fill = norm)) +
geom_tile() +
# fte_theme() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.6, size = 4)) +
labs(x = "Hour of Arrest (Local Time)", y = "Day of Week of Arrest", title = "Police Arrests in San Francisco from 2007 – 2016 by Time of Arrest, Normalized by Type of Crime") +
scale_fill_gradient(low = "white", high = "#2980B9") +
facet_wrap(~ Category, nrow = 6)
plot
Same as above, but with a different facet.
df_arrest_time_district <- df_arrest %>%
mutate(Hour = sapply(Time, get_hour)) %>%
group_by(PdDistrict, DayOfWeek, Hour) %>%
summarize(count = n()) %>%
group_by(PdDistrict) %>%
mutate(norm = count/sum(count))
df_arrest_time_district$DayOfWeek <- factor(df_arrest_time_district$DayOfWeek, level = rev(dow_format))
df_arrest_time_district$Hour <- factor(df_arrest_time_district$Hour, level = 0:23, label = hour_format)
datatable(df_arrest_time_district, options = list(pageLength = 10,scrollX='400px'))
plot <- ggplot(df_arrest_time_district, aes(x = Hour, y = DayOfWeek, fill = norm)) +
geom_tile() +
# fte_theme() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.6, size = 4)) +
labs(x = "Hour of Arrest (Local Time)", y = "Day of Week of Arrest", title = "Police Arrests in San Francisco from 2007 – 2016 by Time of Arrest, Normalized by Station") +
scale_fill_gradient(low = "white", high = "#8E44AD") +
facet_wrap(~ PdDistrict, nrow = 5)
plot
If crime is tied to activities, the period at which activies end may impact.
df_arrest_time_month <- df_arrest %>%
mutate(Month = format(as.Date(Date, "%m/%d/%Y"), "%B"), Hour = sapply(Time, get_hour)) %>%
group_by(Month, DayOfWeek, Hour) %>%
summarize(count = n()) %>%
group_by(Month) %>%
mutate(norm = count/sum(count))
df_arrest_time_month$DayOfWeek <- factor(df_arrest_time_month$DayOfWeek, level = rev(dow_format))
df_arrest_time_month$Hour <- factor(df_arrest_time_month$Hour, level = 0:23, label = hour_format)
# Set order of month facets by chronological order instead of alphabetical
df_arrest_time_month$Month <- factor(df_arrest_time_month$Month,
level = c("January","February","March","April","May","June","July","August","September","October","November","December"))
plot <- ggplot(df_arrest_time_month, aes(x = Hour, y = DayOfWeek, fill = norm)) +
geom_tile() +
# fte_theme() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.6, size = 4)) +
labs(x = "Hour of Arrest (Local Time)", y = "Day of Week of Arrest", title = "Police Arrests in San Francisco from 2007 – 2016 by Time of Arrest, Normalized by Month") +
scale_fill_gradient(low = "white", high = "#E74C3C") +
facet_wrap(~ Month, nrow = 4)
plot
Perhaps things changed overtime?
df_arrest_time_year <- df_arrest %>%
mutate(Year = format(as.Date(Date, "%m/%d/%Y"), "%Y"), Hour = sapply(Time, get_hour)) %>%
group_by(Year, DayOfWeek, Hour) %>%
summarize(count = n()) %>%
group_by(Year) %>%
mutate(norm = count/sum(count))
df_arrest_time_year$DayOfWeek <- factor(df_arrest_time_year$DayOfWeek, level = rev(dow_format))
df_arrest_time_year$Hour <- factor(df_arrest_time_year$Hour, level = 0:23, label = hour_format)
plot <- ggplot(df_arrest_time_year, aes(x = Hour, y = DayOfWeek, fill = norm)) +
geom_tile() +
# fte_theme() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.6, size = 4)) +
labs(x = "Hour of Arrest (Local Time)", y = "Day of Week of Arrest", title = "Police Arrests in San Francisco from 2007 – 2016 by Time of Arrest, Normalized by Year") +
scale_fill_gradient(low = "white", high = "#E67E22") +
facet_wrap(~ Year, nrow = 6)
plot
Create Space Time Cube By Aggregating Points
Emerging Hot Spot Analysis
Add additional attributes to the dataset.
Identify areas with unusually high crime rates.