Ozone Production
David P. Nichols, MD, MPH
August 1, 2018
This post will be an exercise in environmental epidemiology. I didn’t have a specific research question in mind. I wanted to find a environmental dataset and extract any information from it that I could. I chose a dataset available on the UCI website [UCI data] containing 9358 instances of hourly averaged responses from an array of 5 metal oxide chemical sensors embedded in an Air Quality Chemical Multisensor Device. The device was located on the field in a significantly polluted area, at road level,within an Italian city. Data were recorded from March 2004 to February 2005.
First, load the necessary libraries:
library(tidyverse)
library(lubridate)
library(stringr)
library(anytime)
library(readr)
library(MASS)
library(nlme)
library(GGally)
library(MatchIt)
library(Matching)
library(psych)
library(tableone)
Data Download and Cleaning:
Download the data from the UCI website to your working directory, unzip it and store it as “air”.
Attribute Information:
0 Date (DD/MM/YYYY)
1 Time (HH.MM.SS)
2 True hourly averaged concentration CO in mg/m^3 (reference analyzer)
3 PT08.S1 (tin oxide) hourly averaged sensor response (nominally CO targeted)
4 True hourly averaged overall Non Metanic HydroCarbons concentration in microg/m^3 (reference analyzer)
5 True hourly averaged Benzene concentration in microg/m^3 (reference analyzer)
6 PT08.S2 (titania) hourly averaged sensor response (nominally NMHC targeted)
7 True hourly averaged NOx concentration in ppb (reference analyzer)
8 PT08.S3 (tungsten oxide) hourly averaged sensor response (nominally NOx targeted)
9 True hourly averaged NO2 concentration in microg/m^3 (reference analyzer)
10 PT08.S4 (tungsten oxide) hourly averaged sensor response (nominally NO2 targeted)
11 PT08.S5 (indium oxide) hourly averaged sensor response (nominally O3 targeted)
12 Temperature in °C
13 Relative Humidity (%)
14 AH Absolute Humidity
0 Date (DD/MM/YYYY)
1 Time (HH.MM.SS)
2 True hourly averaged concentration CO in mg/m^3 (reference analyzer)
3 PT08.S1 (tin oxide) hourly averaged sensor response (nominally CO targeted)
4 True hourly averaged overall Non Metanic HydroCarbons concentration in microg/m^3 (reference analyzer)
5 True hourly averaged Benzene concentration in microg/m^3 (reference analyzer)
6 PT08.S2 (titania) hourly averaged sensor response (nominally NMHC targeted)
7 True hourly averaged NOx concentration in ppb (reference analyzer)
8 PT08.S3 (tungsten oxide) hourly averaged sensor response (nominally NOx targeted)
9 True hourly averaged NO2 concentration in microg/m^3 (reference analyzer)
10 PT08.S4 (tungsten oxide) hourly averaged sensor response (nominally NO2 targeted)
11 PT08.S5 (indium oxide) hourly averaged sensor response (nominally O3 targeted)
12 Temperature in °C
13 Relative Humidity (%)
14 AH Absolute Humidity
set.seed(123)
air <- read_delim("AirQualityUCI.csv",
";",
escape_double = FALSE,
trim_ws = TRUE)
air <- air[, -c(16:17)]
#easier names to work with
colnames(air) <- c("date", "time", "cogt", "copt",
"nmhc", "benz", "nmhcpt", "noxgt",
"noxpt", "no2", "no2pt", "o3",
"tmp", "rh", "ah")
glimpse(air)
## Observations: 9,471
## Variables: 15
## $ date <chr> "10/03/2004", "10/03/2004", "10/03/2004", "10/03/2004",...
## $ time <chr> "18.00.00", "19.00.00", "20.00.00", "21.00.00", "22.00....
## $ cogt <chr> "2,6", "2", "2,2", "2,2", "1,6", "1,2", "1,2", "1", "0,...
## $ copt <int> 1360, 1292, 1402, 1376, 1272, 1197, 1185, 1136, 1094, 1...
## $ nmhc <int> 150, 112, 88, 80, 51, 38, 31, 31, 24, 19, 14, 8, 16, 29...
## $ benz <chr> "11,9", "9,4", "9,0", "9,2", "6,5", "4,7", "3,6", "3,3"...
## $ nmhcpt <int> 1046, 955, 939, 948, 836, 750, 690, 672, 609, 561, 527,...
## $ noxgt <int> 166, 103, 131, 172, 131, 89, 62, 62, 45, -200, 21, 16, ...
## $ noxpt <int> 1056, 1174, 1140, 1092, 1205, 1337, 1462, 1453, 1579, 1...
## $ no2 <int> 113, 92, 114, 122, 116, 96, 77, 76, 60, -200, 34, 28, 4...
## $ no2pt <int> 1692, 1559, 1555, 1584, 1490, 1393, 1333, 1333, 1276, 1...
## $ o3 <int> 1268, 972, 1074, 1203, 1110, 949, 733, 730, 620, 501, 4...
## $ tmp <dbl> 136, 133, 119, 110, 112, 112, 113, 107, 107, 103, 101, ...
## $ rh <dbl> 489, 477, 540, 600, 596, 592, 568, 600, 597, 602, 605, ...
## $ ah <chr> "0,7578", "0,7255", "0,7502", "0,7867", "0,7888", "0,78...
#remove section of all NA's
air <- air[1:9357, ]
air <- air[, -5] #too many -220's(NA) in this column
#fix the dates from d/m/y to m-d-y
date <- air$date
#date[55]
day <- str_sub(date,1,2)
#day[55]
#date[55]
str_sub(date, 1,2) <- str_sub(date, 4,5)
#date[55]
str_sub(date, 4,5) <- day
#date[55]
air$date <- date
air$date <- anydate(air$date)
air <- air %>% mutate(datetime = paste0(date," ", time))
air$datetime <- anytime(air$datetime)
Prepare each variable for use in a tidy dataset: Starting with carbon monoxide(CO) I will use random imputation for imputation of missing values [See description of random imputation]
#cogt
#change commas to periods
air$cogt <- as.numeric(gsub(",", "\\.", air$cogt))
summary(air$cogt)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -200.00 0.60 1.50 -34.21 2.60 11.90
#change -200's to NA's
air$cogt <- ifelse(air$cogt == -200, NA, air$cogt)
summary(air$cogt)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.100 1.100 1.800 2.153 2.900 11.900 1683
#visualize the data and its log trransformation
ggplot(air) +
geom_density(aes(x = cogt))
ggplot(air) +
geom_density(aes(x = log(cogt))) #looks more normal
#log transformation looks more normally distributed
air$lcogt <- log(air$cogt)
#find the parameters to use for random imputation of NA's
std <- sd(air$lcogt, na.rm = TRUE)
mn <- mean(air$lcogt, na.rm = TRUE)
sm <- sum(as.numeric(is.na(air$lcogt)))
#Random impute the NA's
air$lcogt <- ifelse(is.na(air$lcogt), rnorm(1, mn, std), air$lcogt)
#see what we have
summary(air$lcogt)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.3026 0.1216 0.4055 0.4573 0.9555 2.4765
********************************************************
Next variable:
#copt
summary(air$copt)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -200 921 1053 1049 1221 2040
air$copt <- ifelse(air$copt == -200, NA, air$copt)
summary(air$copt)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 647 937 1063 1100 1231 2040 366
ggplot(air) +
geom_density(aes(x = copt))
ggplot(air) +
geom_density(aes(x = log(copt))) #looks more normal
air$lcopt <- log(air$copt)
std <- sd(air$lcopt, na.rm = TRUE)
mn <- mean(air$lcopt, na.rm = TRUE)
sm <- sum(as.numeric(is.na(air$lcopt)))
air$lcopt <- ifelse(is.na(air$lcopt), rnorm(1, mn, std), air$lcopt)
summary(air$lcopt)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.472 6.847 6.959 6.983 7.107 7.621
********************************************************
Next variable:
#benz
air$benz <- as.numeric(gsub(",", "\\.", air$benz))
summary(air$benz)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -200.000 4.000 7.900 1.866 13.600 63.700
air$benz <- ifelse(air$benz == -200, NA, air$benz)
summary(air$benz)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.10 4.40 8.20 10.08 14.00 63.70 366
ggplot(air) +
geom_density(aes(x = benz))+
xlim(0,5)
ggplot(air) +
geom_density(aes(x = log(benz)))+
xlim(0,5)
air$lbenz <- log(air$benz)
std <- sd(air$lbenz, na.rm = TRUE)
mn <- mean(air$lbenz, na.rm = TRUE)
sm <- sum(as.numeric(is.na(air$lbenz)))
air$lbenz <- ifelse(is.na(air$lbenz), rnorm(1, mn, std), air$lbenz)
summary(air$lbenz)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.303 1.526 2.152 2.065 2.695 4.154
********************************************************
Next variable:
#nmhcpt
summary(air$nmhcpt)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -200.0 711.0 895.0 894.6 1105.0 2214.0
air$nmhcpt <- ifelse(air$nmhcpt == -200, NA, air$nmhcpt)
summary(air$nmhcpt)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 383.0 734.5 909.0 939.2 1116.0 2214.0 366
ggplot(air) +
geom_density(aes(x = nmhcpt))+
xlim(5,1000)
ggplot(air) +
geom_density(aes(x = log(nmhcpt)))+
xlim(5,8)
air$lnmhcpt <- log(air$nmhcpt)
std <- sd(air$lnmhcpt, na.rm = TRUE)
mn <- mean(air$lnmhcpt, na.rm = TRUE)
sm <- sum(as.numeric(is.na(air$lnmhcpt)))
air$lnmhcpt <- ifelse(is.na(air$lnmhcpt), rnorm(1, mn, std), air$lnmhcpt)
summary(air$lnmhcpt)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.948 6.611 6.825 6.805 7.008 7.703
********************************************************
Next variable:
#noxgt
summary(air$noxgt)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -200.0 50.0 141.0 168.6 284.0 1479.0
air$noxgt <- ifelse(air$noxgt == -200, NA, air$noxgt)
ggplot(air) +
geom_density(aes(x = noxgt)) +
xlim(0,800)
ggplot(air) +
geom_density(aes(x = log(noxgt)))+
xlim(0,10)
air$lnoxgt <- log(air$noxgt)
std <- sd(air$lnoxgt, na.rm = TRUE)
mn <- mean(air$lnoxgt, na.rm = TRUE)
sm <- sum(as.numeric(is.na(air$lnoxgt)))
air$lnoxgt <- ifelse(is.na(air$lnoxgt), rnorm(1, mn, std), air$lnoxgt)
summary(air$lnoxgt)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.6931 4.7185 5.2686 5.1743 5.6490 7.2991
********************************************************
Next variable:
#noxpt
summary(air$noxpt)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -200 637 794 795 960 2683
air$noxpt <- ifelse(air$noxpt == -200, NA, air$noxpt)
summary(air$noxpt)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 322.0 658.0 806.0 835.5 969.5 2683.0 366
ggplot(air) +
geom_density(aes(x = noxpt)) +
xlim(250,1800)
ggplot(air) +
geom_density(aes(x = log(noxpt)))+
xlim(5,8)
air$lnoxpt <- log(air$noxpt)
std <- sd(air$lnoxpt, na.rm = TRUE)
mn <- mean(air$lnoxpt, na.rm = TRUE)
sm <- sum(as.numeric(is.na(air$lnoxpt)))
air$lnoxpt <- ifelse(is.na(air$lnoxpt), rnorm(1, mn, std), air$lnoxpt)
summary(air$lnoxpt)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.775 6.501 6.707 6.703 6.904 7.895
********************************************************
Next variable:
#no2
summary(air$no2)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -200.00 53.00 96.00 58.15 133.00 340.00
air$no2 <- ifelse(air$no2 == -200, NA, air$no2)
ggplot(air) +
geom_density(aes(x = no2)) +
xlim(250,1800)
ggplot(air) +
geom_density(aes(x = log(no2)))+
xlim(5,8)
#The log transformation looks like a gamma distribution so wqe will use a gamma for imputation
air$no2 <- ifelse(is.na(air$no2), rgamma(1, shape = 1, rate = 1), air$no2)
summary(air$no2)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.879 53.000 96.000 93.400 133.000 340.000
********************************************************
Next variable:
#no2pt
summary(air$no2pt)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -200 1185 1446 1391 1662 2775
air$no2pt <- ifelse(air$no2pt == -200, NA, air$no2pt)
summary(air$no2pt)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 551 1227 1463 1456 1674 2775 366
ggplot(air) +
geom_density(aes(x = no2pt)) +
xlim(250,2500)
ggplot(air) +
geom_density(aes(x = log(no2pt))) +
xlim(6, 8)
air$lno2pt <- log(air$no2pt)
std <- sd(air$lno2pt, na.rm = TRUE)
mn <- mean(air$lno2pt, na.rm = TRUE)
sm <- sum(as.numeric(is.na(air$lnoxpt)))
air$lno2pt <- ifelse(is.na(air$lno2pt), rnorm(1, mn, std), air$lno2pt)
summary(air$lno2pt)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.312 7.077 7.277 7.242 7.416 7.928
********************************************************
Next variable:
#o3
summary(air$o3)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -200.0 700.0 942.0 975.1 1255.0 2523.0
air$o3 <- ifelse(air$o3 == -200, NA, air$o3 )
summary(air$o3)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 221.0 731.5 963.0 1022.9 1273.5 2523.0 366
ggplot(air) +
geom_density(aes(x = o3)) +
xlim(250,1800)
ggplot(air) +
geom_density(aes(x = log(o3))) +
xlim(6, 8)
air$lo3 <- log(air$o3)
std <- sd(air$lo3, na.rm = TRUE)
mn <- mean(air$lo3, na.rm = TRUE)
sm <- sum(as.numeric(is.na(air$lo3)))
air$lo3 <- ifelse(is.na(air$lo3), rnorm(1, mn, std), air$lo3)
summary(air$lo3)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.398 6.573 6.848 6.841 7.135 7.833
********************************************************
Next variable:
#tmp
summary(air$tmp)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -200.0 109.0 172.0 168.2 241.0 446.0
air$tmp <- ifelse(air$tmp == -200, NA, air$tmp )
summary(air$tmp)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -19.0 118.0 178.0 183.2 244.0 446.0 366
ggplot(air) +
geom_density(aes(x = tmp)) +
xlim(-50, 500)
ggplot(air) +
geom_density(aes(x = log(tmp))) +
xlim(1.0, 6.5)
#first one looks more normal
air$tmp <- ifelse(is.na(air$tmp), rnorm(1, mn, std), air$tmp)
std <- sd(air$tmp, na.rm = TRUE)
mn <- mean(air$tmp, na.rm = TRUE)
sm <- sum(as.numeric(is.na(air$tmp)))
air$ltmp <- log(air$tmp)
air$ltmp <- ifelse(is.na(air$ltmp), rnorm(1, mn, std), air$ltmp)
summary(air$tmp)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -19.0 109.0 172.0 176.3 241.0 446.0
********************************************************
Next variable:
#rh
summary(air$rh)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -200.0 341.0 486.0 465.3 619.0 887.0
air$rh<- ifelse(air$rh == -200, NA, air$rh )
summary(air$rh)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 92.0 358.0 496.0 492.3 625.0 887.0 366
ggplot(air) +
geom_density(aes(x = rh)) +
xlim(-50, 1000)
ggplot(air) +
geom_density(aes(x = log(rh))) +
xlim(4, 10)
#first one looks more normal
air$rh<- ifelse(is.na(air$rh), rnorm(1, mn, std), air$rh)
#no log transform needed here
summary(air$rh)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 92.0 341.0 486.0 481.3 619.0 887.0
********************************************************
Next variable:
#ah
air$ah <- as.numeric(gsub(",", "\\.", air$ah))
summary(air$ah)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -200.0000 0.6923 0.9768 -6.8376 1.2962 2.2310
air$ah<- ifelse(air$ah == -200, NA, air$ah )
summary(air$ah)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.1847 0.7368 0.9954 1.0255 1.3137 2.2310 366
ggplot(air) +
geom_density(aes(x = ah)) +
xlim(0, 2.5)
ggplot(air) +
geom_density(aes(x = log(ah))) +
xlim(-1,1)
air$lah <- log(air$ah)
std <- sd(air$lah, na.rm = TRUE)
mn <- mean(air$lah, na.rm = TRUE)
sm <- sum(as.numeric(is.na(air$lah)))
air$lah<- ifelse(is.na(air$lah), rnorm(1, mean = mn, std), air$lah)
summary(air$lah)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.68902 -0.29290 0.01528 -0.05716 0.25944 0.80245
********************************************************
Now, let’s keep the transformed variables and rename it “air1”
air1 <- air %>% dplyr::select(
datetime, rh, tmp, no2, lcogt, lcopt, lbenz, lnmhcpt,
lnoxgt, lnoxpt, lno2pt, lo3, lah)
glimpse(air1)
## Observations: 9,357
## Variables: 13
## $ datetime <dttm> 2004-03-10 18:00:00, 2004-03-10 19:00:00, 2004-03-10...
## $ rh <dbl> 489, 477, 540, 600, 596, 592, 568, 600, 597, 602, 605...
## $ tmp <dbl> 136, 133, 119, 110, 112, 112, 113, 107, 107, 103, 101...
## $ no2 <dbl> 113.0000000, 92.0000000, 114.0000000, 122.0000000, 11...
## $ lcogt <dbl> 0.95551145, 0.69314718, 0.78845736, 0.78845736, 0.470...
## $ lcopt <dbl> 7.215240, 7.163947, 7.245655, 7.226936, 7.148346, 7.0...
## $ lbenz <dbl> 2.47653840, 2.24070969, 2.19722458, 2.21920348, 1.871...
## $ lnmhcpt <dbl> 6.952729, 6.861711, 6.844815, 6.854355, 6.728629, 6.6...
## $ lnoxgt <dbl> 5.111988, 4.634729, 4.875197, 5.147494, 4.875197, 4.4...
## $ lnoxpt <dbl> 6.962243, 7.068172, 7.038784, 6.995766, 7.094235, 7.1...
## $ lno2pt <dbl> 7.433667, 7.351800, 7.349231, 7.367709, 7.306531, 7.2...
## $ lo3 <dbl> 7.145196, 6.879356, 6.979145, 7.092574, 7.012115, 6.8...
## $ lah <dbl> -0.2773358, -0.3208942, -0.2874154, -0.2399083, -0.23...
********************************************************
Trim the ends of the time series and rename: “air2”
#make the time series start at 12:00 midnight and end also
air2 <- air1[-c(1:6), ]
air2 <- air2[1:9336, ]
air2$day <- rep(1:389, each = 24)
air2$date <- factor(anydate(air2$datetime))
air2$month <- factor(month(air2$date))
air2$dayofweek <- factor(wday(air2$date, label = TRUE))
********************************************************
We will center and scale the numeric variables
#center and scale the numerical data
air3 <- air2
air3 <- air3 %>%
mutate_at(.vars = vars(rh, tmp, no2, lcogt, lcopt, lbenz,
lnmhcpt, lnoxgt, lnoxpt,
lno2pt, lo3, lah),
.funs = funs(scale))
********************************************************
Please see the next page for data exploration and analysis.
No comments:
Post a Comment