For this project, we looked at the Environmental Protection Agency’s fuel economy data for every vehicle tested at the National Vehicle and Fuel Emissions Laboratory in Ann Arbor, Michigan since 1984. This dataset has data on almost 37,000 different vehicles, and includes many attributes to analyze. This is the R and SQL replication of the Tableau visualizations in Project 4. In addtion, we created our own dataset called CAR_Sales which contains the total number of cars sold in the US from 1984 to 2014. The numbers came from this website.
require("jsonlite")
## Loading required package: jsonlite
##
## Attaching package: 'jsonlite'
##
## The following object is masked from 'package:utils':
##
## View
require("RCurl")
## Loading required package: RCurl
## Loading required package: bitops
# Loads the data from Fast Food table into Fast Food dataframe
# Change the USER and PASS below to be your UTEid
vehicles <- data.frame(fromJSON(getURL(URLencode('skipper.cs.utexas.edu:5001/rest/native/?query="select ATVTYPE,BARRELS08,BARRELSA08,CITY08,CITYA08,CO2TAILPIPEAGPM,CO2TAILPIPEGPM,COMB08,COMBA08,CYLINDERS,FUELCOST08,FUELCOSTA08,FUELTYPE,FUELTYPE1,FUELTYPE2,HIGHWAY08,HIGHWAYA08,HLV,HPV,LV2,LV4,MPGDATA,PV2,PV4,YEAR,MAKE,TRANY from VEHICLES"'),httpheader=c(DB='jdbc:oracle:thin:@sayonara.microlab.cs.utexas.edu:1521:orcl', USER='C##cs329e_sm44585', PASS='orcl_sm44585', MODE='native_mode', MODEL='model', returnDimensions = 'False', returnFor = 'JSON'), verbose = TRUE),))
summary(vehicles)
## ATVTYPE BARRELS08 BARRELSA08 CITY08
## null :33825 Min. : 0.0599 Min. : 0.0000 Min. : 6.00
## FFV : 1280 1st Qu.:14.9643 1st Qu.: 0.0000 1st Qu.: 15.00
## Diesel : 1018 Median :17.3375 Median : 0.0000 Median : 17.00
## Hybrid : 416 Mean :17.6101 Mean : 0.2144 Mean : 17.81
## EV : 95 3rd Qu.:20.5896 3rd Qu.: 0.0000 3rd Qu.: 20.00
## CNG : 50 Max. :47.0683 Max. :18.3043 Max. :138.00
## (Other): 66
## CITYA08 CO2TAILPIPEAGPM CO2TAILPIPEGPM COMB08
## Min. : 0.0000 Min. : 0.00 Min. : 0.0 Min. : 7.00
## 1st Qu.: 0.0000 1st Qu.: 0.00 1st Qu.: 401.0 1st Qu.: 16.00
## Median : 0.0000 Median : 0.00 Median : 467.7 Median : 19.00
## Mean : 0.4855 Mean : 17.77 Mean : 475.5 Mean : 20.05
## 3rd Qu.: 0.0000 3rd Qu.: 0.00 3rd Qu.: 555.4 3rd Qu.: 23.00
## Max. :127.0000 Max. :719.00 Max. :1269.6 Max. :124.00
##
## COMBA08 CYLINDERS FUELCOST08 FUELCOSTA08
## Min. : 0.000 4 :14010 Min. : 500 Min. : 0.00
## 1st Qu.: 0.000 6 :12838 1st Qu.:1600 1st Qu.: 0.00
## Median : 0.000 8 : 8099 Median :1950 Median : 0.00
## Mean : 0.546 5 : 766 Mean :1968 Mean : 91.73
## 3rd Qu.: 0.000 12 : 533 3rd Qu.:2300 3rd Qu.: 0.00
## Max. :117.000 3 : 200 Max. :5900 Max. :4000.00
## (Other): 304
## FUELTYPE FUELTYPE1 FUELTYPE2
## Regular :24592 Diesel : 1090 E85 : 1280
## Premium : 9506 Electricity : 95 Electricity: 38
## Gasoline or E85: 1164 Midgrade Gasoline: 61 Natural Gas: 20
## Diesel : 1090 Natural Gas : 60 null :35404
## Premium or E85 : 116 Premium Gasoline : 9645 Propane : 8
## Electricity : 95 Regular Gasoline :25799
## (Other) : 187
## HIGHWAY08 HIGHWAYA08 HLV HPV
## Min. : 9.00 Min. : 0.0000 Min. : 0.000 Min. : 0.00
## 1st Qu.: 20.00 1st Qu.: 0.0000 1st Qu.: 0.000 1st Qu.: 0.00
## Median : 24.00 Median : 0.0000 Median : 0.000 Median : 0.00
## Mean : 23.95 Mean : 0.6506 Mean : 2.036 Mean : 10.49
## 3rd Qu.: 27.00 3rd Qu.: 0.0000 3rd Qu.: 0.000 3rd Qu.: 0.00
## Max. :111.00 Max. :107.0000 Max. :49.000 Max. :195.00
##
## LV2 LV4 MPGDATA PV2
## Min. : 0.000 Min. : 0.00 N:25512 Min. : 0.00
## 1st Qu.: 0.000 1st Qu.: 0.00 Y:11238 1st Qu.: 0.00
## Median : 0.000 Median : 0.00 Median : 0.00
## Mean : 1.856 Mean : 6.19 Mean : 13.68
## 3rd Qu.: 0.000 3rd Qu.:13.00 3rd Qu.: 0.00
## Max. :41.000 Max. :55.00 Max. :194.00
##
## PV4 YEAR MAKE TRANY
## Min. : 0.00 Min. :1984 Chevrolet: 3705 Automatic 4-spd:11039
## 1st Qu.: 0.00 1st Qu.:1990 Ford : 3042 Manual 5-spd : 8283
## Median : 0.00 Median :2000 Dodge : 2498 Automatic 3-spd: 3151
## Mean : 33.74 Mean :2000 GMC : 2345 Automatic (S6) : 2413
## 3rd Qu.: 90.00 3rd Qu.:2009 Toyota : 1863 Manual 6-spd : 2326
## Max. :192.00 Max. :2016 BMW : 1600 Automatic 5-spd: 2184
## (Other) :21697 (Other) : 7354
require("jsonlite")
require("RCurl")
# Loads the data from Total Car Sales table into CAR_Sale dataframe
# Change the USER and PASS below to be your UTEid
CAR_Sale <- data.frame(fromJSON(getURL(URLencode('skipper.cs.utexas.edu:5001/rest/native/?query="select YEAR, CAR_SALE from CAR_SALE"'),httpheader=c(DB='jdbc:oracle:thin:@sayonara.microlab.cs.utexas.edu:1521:orcl', USER='C##cs329e_cz4795', PASS='orcl_cz4795', MODE='native_mode', MODEL='model', returnDimensions = 'False', returnFor = 'JSON'), verbose = TRUE),))
summary(CAR_Sale)
## YEAR CAR_SALE
## Min. :1984 Min. : 5400890
## 1st Qu.:1992 1st Qu.: 7611158
## Median :1999 Median : 8213113
## Mean :1999 Mean : 8338861
## 3rd Qu.:2006 3rd Qu.: 8884120
## Max. :2014 Max. :11404239
#Before running this R file make sure you set you working directory to where the CSV file located.
file_path <- "vehicles.csv"
df <- read.csv(file_path, stringsAsFactors = FALSE)
# Replace "." (i.e., period) with "_" in the column names.
names(df) <- gsub("\\.+", "_", names(df))
str(df) # Uncomment this and run just the lines to here to get column types to use for getting the list of measures.
# Generate List of Measures
measures <- c("barrels08", "barrelsA08", "charge120","charge240","city08", "cityA08", "co2TailpipeAGpm", "co2TailpipeGpm", "comb08", "combA08", "fuelCost08", "fuelCostA08", "highway08", "highwayA08", "hlv","hpv", "lv2", "lv4", "pv2", "pv4")
# Get rid of special characters in each column.
# Google ASCII Table to understand the following:
for(n in names(df)) {
df[n] <- data.frame(lapply(df[n], gsub, pattern="[^ -~]",replacement= ""))
}
dimensions <- setdiff(names(df), measures)
#dimensions
if( length(measures) > 1 || ! is.na(dimensions)) {
for(d in dimensions) {
# Get rid of " and ' in dimensions.
df[d] <- data.frame(lapply(df[d], gsub, pattern="[\"']",replacement= ""))
# Change & to and in dimensions.
df[d] <- data.frame(lapply(df[d], gsub, pattern="&",replacement= " and "))
# Change : to ; in dimensions.
df[d] <- data.frame(lapply(df[d], gsub, pattern=":",replacement= ";"))
}
}
# Get rid of all characters in measures except for numbers, the - sign, and period.dimensions
if( length(measures) > 1 || ! is.na(measures)) {
for(m in measures) {
df[m] <- data.frame(lapply(df[m], gsub, pattern="[^--.0-9]",replacement= ""))
}
}
write.csv(df, paste(gsub(".csv", "", file_path), ".reformatted.csv", sep=""), row.names=FALSE, na = "")
tableName <- gsub(" +", "_", gsub("[^A-z, 0-9, ]", "", gsub(".csv", "", file_path)))
sql <- paste("CREATE TABLE", tableName, "(\n-- Change table_name to the table name you want.\n")
if( length(measures) > 1 || ! is.na(dimensions)) {
for(d in dimensions) {
sql <- paste(sql, paste(d, "varchar2(4000),\n"))
}
}
if( length(measures) > 1 || ! is.na(measures)) {
for(m in measures) {
if(m != tail(measures, n=1)) sql <- paste(sql, paste(m, "number(38,4),\n"))
else sql <- paste(sql, paste(m, "number(38,4)\n"))
}
}
sql <- paste(sql, ");")
cat(sql)
#Before running this R file make sure you set you working directory to where the CSV file located.
file_path <- "Car_Sale.csv"
df <- read.csv(file_path, stringsAsFactors = FALSE)
# Replace "." (i.e., period) with "_" in the column names.
names(df) <- gsub("\\.+", "_", names(df))
str(df) # Uncomment this and run just the lines to here to get column types to use for getting the list of measures.
# Generate List of Measures
measures <- c("Car_Sale")
# Get rid of special characters in each column.
# Google ASCII Table to understand the following:
for(n in names(df)) {
df[n] <- data.frame(lapply(df[n], gsub, pattern="[^ -~]",replacement= ""))
}
dimensions <- setdiff(names(df), measures)
#dimensions
if( length(measures) > 1 || ! is.na(dimensions)) {
for(d in dimensions) {
# Get rid of " and ' in dimensions.
df[d] <- data.frame(lapply(df[d], gsub, pattern="[\"']",replacement= ""))
# Change & to and in dimensions.
df[d] <- data.frame(lapply(df[d], gsub, pattern="&",replacement= " and "))
# Change : to ; in dimensions.
df[d] <- data.frame(lapply(df[d], gsub, pattern=":",replacement= ";"))
}
}
# Get rid of all characters in measures except for numbers, the - sign, and period.dimensions
if( length(measures) > 1 || ! is.na(measures)) {
for(m in measures) {
df[m] <- data.frame(lapply(df[m], gsub, pattern="[^--.0-9]",replacement= ""))
}
}
write.csv(df, paste(gsub(".csv", "", file_path), ".reformatted.csv", sep=""), row.names=FALSE, na = "")
tableName <- gsub(" +", "_", gsub("[^A-z, 0-9, ]", "", gsub(".csv", "", file_path)))
sql <- paste("CREATE TABLE", tableName, "(\n-- Change table_name to the table name you want.\n")
if( length(measures) > 1 || ! is.na(dimensions)) {
for(d in dimensions) {
sql <- paste(sql, paste(d, "varchar2(4000),\n"))
}
}
if( length(measures) > 1 || ! is.na(measures)) {
for(m in measures) {
if(m != tail(measures, n=1)) sql <- paste(sql, paste(m, "number(38,4),\n"))
else sql <- paste(sql, paste(m, "number(38,4)\n"))
}
}
sql <- paste(sql, ");")
cat(sql)
The first plot in Tableau is a bar chart with a reference line that looks at the effect of transmission type on highway and city MPG. Our first impression was that a manual transmission would produce the best MPG, but we were surprised to learn that an automatic 6-speed transmission produces not only the highest city MPG but also the highest highway MPG. Doing some further research, we found that manual transmissions used to be more fuel efficent, but because of recent advances in automatic transmission technology, automatic transmissions have started to become more efficient in some cases.
require(tidyr)
## Loading required package: tidyr
require(dplyr)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require(ggplot2)
## Loading required package: ggplot2
require(reshape2)
## Loading required package: reshape2
#R workflow
bar_chart <- vehicles %>% select(TRANY, HIGHWAY08, CITY08) %>% subset(TRANY %in% c("Automatic 3-spd", "Automatic 4-spd","Automatic 5-spd","Automatic 6-spd","Automatic 6spd","Automatic 7-spd", "Automatic 8-spd", "Automatic 9-spd", "Manual 3-spd", "Manual 4-spd", "Manual 5-spd", "Manual 5 spd", "Manual 6-spd", "Manual 7-spd")) %>% group_by(TRANY) %>% summarise(avg_city_MPG = mean(CITY08), avg_highway_MPG = mean(HIGHWAY08)) %>% melt(id.vars = c("TRANY")) %>% group_by(variable) %>% mutate(WINDOW_AVG_MPG = mean(value))
#Plot Function to generate bar chart with reference line and values
ggplot() +
coord_cartesian() +
scale_x_discrete() +
scale_y_continuous() +
facet_wrap(~variable) +
labs(title='Average Highway and City MPG based on transmission ') +
labs(x=paste("Transmission"), y=paste("MPG")) +
layer(data=bar_chart,
mapping=aes(x=TRANY, y=value),
stat="identity",
stat_params=list(),
geom="bar",
geom_params=list(colour="blue", fill="white"),
position=position_dodge()
) + coord_flip() +
layer(data=bar_chart,
mapping=aes(x=TRANY, y=value, label=round(WINDOW_AVG_MPG, 2)),
stat="identity",
stat_params=list(),
geom="text",
geom_params=list(colour="black", hjust=2),
position=position_identity()
) +
layer(data=bar_chart,
mapping=aes(yintercept = WINDOW_AVG_MPG),
geom="hline",
geom_params=list(colour="red")
) +
layer(data=bar_chart,
mapping=aes(x=TRANY, y=value, label=round(value, 2)),
stat="identity",
stat_params=list(),
geom="text",
geom_params=list(colour="black", hjust=0),
position=position_identity()
)
The next tableau plot is a scatterplot that looks at how the combined MPG of all cars since 1984 has evolved over the years. This plot was created to see if a trend was present. Initially, we were fairly confident that a positive trend would be present due to the recent advances in EV technology and general improvements in the efficency of internal combustion engines.
However, what we were surprised to find was how drastic and inconsistent the trend was. From 1984 to about 1998, no car that the EPA tested ever got above a combined MPG rating of 50, resutling in a relatively flat trend. Then from 1998 to 2003, there are numerous vehicles getting above the 50 MPG mark, resulting in a very steep positive trend for those years. From 2003 to just before 2010, all but one vehicle got less than 55 MPG, so there is a very steep decline and then flat trend for that time period. From 2010 onward, a steep positive trend emerges again as the number of vehicles above the 55 MPG mark explodes.
# This file creates the scatterplot.
require(tidyr)
require(dplyr)
require(ggplot2)
# This selects only the comb08 and year columns whilte transforming the year column to a date
scatterplot <- vehicles %>% select(COMB08, YEAR) %>% transform(YEAR = as.Date(as.character(YEAR), "%Y"))
ggplot() +
coord_cartesian() +
scale_x_date() +
scale_y_continuous() +
labs(title="Combined MPG of every model year") +
labs(x="Year", y="Combined MPG") +
layer(data=scatterplot ,
mapping=aes(x=YEAR, y=COMB08),
stat="identity",
stat_params=list(),
geom="point",
geom_params=list(),
position=position_identity()
)
The next plot is a tableau cross tab with a Key Performance Indicator that provides a visual representation of which vehicle manufacturers produced 2 door cars that were efficent (high combined MPG) and spacious (as measured by the passenger volume of the vehicle). Plots 7 and 8 look at 4 door cars since 1984. To calculate the ratio used for the KPI, we summed up the combined MPG of all the 2 door vehicles that a manufacturer produced during a model year and divided that by the total passenger volume for all those 2 door vehicles. The KPI categories are as follows:
The interesting takeaway from this graph is that it visually represents a car company’s 2 door vehicle portfolio in terms of how efficent those cars are relative to how much passenger volume it has (which, for the purposes of our analysis, is used as a proxy for the size of the vehicle). It also shows that some car companies, like Chevy and Ford, have generally improved the efficency of their vehicles relative to how large the car is. There are also other companies, like Honda, that have newer vehicles achieving much worse efficieny to size ratios than the older models.
require(jsonlite)
require(RCurl)
# The following is equivalent to creat a crosstab with two KPIs in Tableau"
MPG_PV2_KPI_LOW = 1
MPG_PV2_KPI_HIGH = 2
# The following is equivalent to creat a crosstab with two KPIs in Tableau"
crosstab <- vehicles %>% group_by(MAKE, YEAR) %>% summarize(sum_comb08 = sum(COMB08), sum_pv2 = sum(PV2),sum_pv4 = sum(PV4)) %>% mutate(ratio_1 = sum_comb08 / (sum_pv2))%>% mutate(ratio_2 = sum_comb08 / (sum_pv4)) %>% mutate(kpi_1 = ifelse(ratio_1 < MPG_PV2_KPI_LOW, '03 Not Efficient or Spacious', ifelse(ratio_1 <= MPG_PV2_KPI_HIGH, '02 Average Efficiency and Space', '01 Efficient and Spacious')))%>% mutate(kpi_2 = ifelse(ratio_2 < MPG_PV2_KPI_LOW, '03 Not Efficient or Spacious', ifelse(ratio_2 <= MPG_PV2_KPI_HIGH, '02 Average Efficiency and Space', '01 Efficient and Spacious'))) %>%filter(MAKE %in% c("Acura", "Aston Martin", "Audi", "Bentley", "BMW", "Buick", "Chevrolet", "Dodge", "Ferrari", "Ford", "Honda", "Kia", "Lincoln", "Lexus", "Maserati", "Mazda", "Mercedes-Benz", "Nissan", "Toyota", "Volkswagen")) %>% filter(ratio_1 != Inf, ratio_2 != Inf)
# This line turns the make and year columns into ordered factors.
crosstab <- crosstab %>% transform(MAKE = ordered(MAKE), YEAR = ordered(YEAR))
#This generates the PV2 plot with combined MPG
ggplot() +
coord_cartesian() +
scale_x_discrete() +
scale_y_discrete() +
labs(title='Vehicle Crosstab of Efficiency/Space ratio for 2 door cars') +
labs(x=paste("Make"), y=paste("Year")) +
layer(data=crosstab,
mapping=aes(x=MAKE, y=YEAR, label=round(ratio_1, 2)),
stat="identity",
stat_params=list(),
geom="text",
geom_params=list(colour="black"),
position=position_identity()
) +
layer(data=crosstab,
mapping=aes(x=MAKE, y=YEAR, fill=kpi_1),
stat="identity",
stat_params=list(),
geom="tile",
geom_params=list(alpha=0.50),
position=position_identity()
)
The final tableau plot is a cross tab that provides a visual representation of which vehicle manufacturers produced 4 door cars that were efficent (high combined MPG) and spacious (as measured by the passenger volume of the vehicle). To calculate the ratio used for the KPI, we summed up the combined MPG of all the 4 door vehicles that a manufacturer produced during a model year and divided that by the total passenger volume for all those 4 door vehicles. The KPI categories are as follows:
The interesting takeaway from this graph is that it visually represents a car company’s 4 door vehicle portfolio in terms of how efficent those cars are relative to how much passenger volume it has (which, for the purposes of our analysis, is used as a proxy for the size of the vehicle).
#This generates the PV4 with combined MPG plot
ggplot() +
coord_cartesian() +
scale_x_discrete() +
scale_y_discrete() +
labs(title='Vehicle Crosstab of Efficiency/Space ratio for 4 door cars') +
labs(x=paste("Make"), y=paste("Year")) +
layer(data=crosstab,
mapping=aes(x=MAKE, y=YEAR, label=round(ratio_2, 2)),
stat="identity",
stat_params=list(),
geom="text",
geom_params=list(colour="black"),
position=position_identity()
) +
layer(data=crosstab,
mapping=aes(x=MAKE, y=YEAR, fill=kpi_2),
stat="identity",
stat_params=list(),
geom="tile",
geom_params=list(alpha=0.50),
position=position_identity()
)
For this plot, we blended together the vehicles and car sales datasets based on the year. In other words, the vehicles dataset was grouped by the model year to get a count of the number of vehicles that the EPA tested for that model year and was blended with the Car Sales dataset based on the year. This gave us the ability to see what relationship existed between the number of cars the EPA tested and the number of cars sold in the United States for a given year.
In order to produce a good visualization, we had to filter the data to model years 2009 to 2014. Also, the number of cars tested by the EPA is significantly smaller than the total car sales for any given year so while it may not look like two bars are shown for each year, there is in fact one bar for the total number of cars tested by the EPA for a model year and another bar for the total number of cars sold for that model year.
We were surprised to learn that the EPA, on average from 2009 to 2014, tests roughly the same number of vehicles per model year, despite the fact that the number of vehicles sold during that time period has increased by almost 50%! This could be due to a number of things including perhaps the EPA has the testing capacity limit on the number of vehicles they can test per year.
require("jsonlite")
require("RCurl")
require(ggplot2)
require(dplyr)
# The following is equivalent to "04 Blending 2 Data Sources.twb"
df <- data.frame(fromJSON(getURL(URLencode(gsub("\n", " ", 'skipper.cs.utexas.edu:5001/rest/native/?query=
"select YEAR || \\\' Total_Vehicles_Tested\\\' as measure_names, count(MAKE) as measure_values from VEHICLES
where YEAR between 2009 and 2014
group by YEAR
union all
select YEAR || \\\' Total_Vehicles_Sold\\\' as measure_names, CAR_SALE as measure_values from CAR_SALE
where YEAR between 2009 and 2014
order by measure_names;"
')), httpheader=c(DB='jdbc:oracle:thin:@sayonara.microlab.cs.utexas.edu:1521:orcl', USER='C##cs329e_cz4795', PASS='orcl_cz4795', MODE='native_mode', MODEL='model', returnDimensions = 'False', returnFor = 'JSON'), verbose = TRUE)));
ggplot() +
coord_cartesian() +
scale_x_discrete() +
scale_y_continuous() +
#facet_wrap(~CLARITY, ncol=1) +
labs(title='Blending 2 Data Sources') +
labs(x=paste("Year Sales"), y=paste("Value")) +
layer(data=df,
mapping=aes(x=MEASURE_NAMES, y=MEASURE_VALUES),
stat="identity",
stat_params=list(),
geom="bar",
geom_params=list(colour="blue"),
position=position_identity()
) + coord_flip() +
layer(data=df,
mapping=aes(x=MEASURE_NAMES, y=MEASURE_VALUES, label=round(MEASURE_VALUES)),
stat="identity",
stat_params=list(),
geom="text",
geom_params=list(colour="black", hjust=-0.5),
position=position_identity()
)