Ozone Production - Data Download and Cleaning

This post will be an exercise in environmental epidemiology. I didn’t have a specific research question in mind. I wanted to find a environmental dataset and extract any information from it that I could. I chose a dataset available on the UCI website [UCI data] containing 9358 instances of hourly averaged responses from an array of 5 metal oxide chemical sensors embedded in an Air Quality Chemical Multisensor Device. The device was located on the field in a significantly polluted area, at road level,within an Italian city. Data were recorded from March 2004 to February 2005. 
First, load the necessary libraries:
library(tidyverse)
library(lubridate)
library(stringr)
library(anytime)
library(readr)
library(MASS)
library(nlme)
library(GGally)
library(MatchIt)
library(Matching)
library(psych)
library(tableone)

Data Download and Cleaning:

Download the data from the UCI website to your working directory, unzip it and store it as “air”. 
Attribute Information:
0 Date (DD/MM/YYYY)
1 Time (HH.MM.SS)
2 True hourly averaged concentration CO in mg/m^3 (reference analyzer)
3 PT08.S1 (tin oxide) hourly averaged sensor response (nominally CO targeted)
4 True hourly averaged overall Non Metanic HydroCarbons concentration in microg/m^3 (reference analyzer)
5 True hourly averaged Benzene concentration in microg/m^3 (reference analyzer)
6 PT08.S2 (titania) hourly averaged sensor response (nominally NMHC targeted)
7 True hourly averaged NOx concentration in ppb (reference analyzer)
8 PT08.S3 (tungsten oxide) hourly averaged sensor response (nominally NOx targeted)
9 True hourly averaged NO2 concentration in microg/m^3 (reference analyzer)
10 PT08.S4 (tungsten oxide) hourly averaged sensor response (nominally NO2 targeted)
11 PT08.S5 (indium oxide) hourly averaged sensor response (nominally O3 targeted)
12 Temperature in °C
13 Relative Humidity (%)
14 AH Absolute Humidity 
set.seed(123)
air <- read_delim("AirQualityUCI.csv",
                  ";",
                  escape_double = FALSE,
                  trim_ws = TRUE)
air <- air[, -c(16:17)]
#easier names to work with
colnames(air) <- c("date", "time", "cogt", "copt",
                    "nmhc", "benz", "nmhcpt", "noxgt",
                    "noxpt", "no2", "no2pt", "o3",
                   "tmp", "rh", "ah")
glimpse(air)                    
## Observations: 9,471
## Variables: 15
## $ date   <chr> "10/03/2004", "10/03/2004", "10/03/2004", "10/03/2004",...
## $ time   <chr> "18.00.00", "19.00.00", "20.00.00", "21.00.00", "22.00....
## $ cogt   <chr> "2,6", "2", "2,2", "2,2", "1,6", "1,2", "1,2", "1", "0,...
## $ copt   <int> 1360, 1292, 1402, 1376, 1272, 1197, 1185, 1136, 1094, 1...
## $ nmhc   <int> 150, 112, 88, 80, 51, 38, 31, 31, 24, 19, 14, 8, 16, 29...
## $ benz   <chr> "11,9", "9,4", "9,0", "9,2", "6,5", "4,7", "3,6", "3,3"...
## $ nmhcpt <int> 1046, 955, 939, 948, 836, 750, 690, 672, 609, 561, 527,...
## $ noxgt  <int> 166, 103, 131, 172, 131, 89, 62, 62, 45, -200, 21, 16, ...
## $ noxpt  <int> 1056, 1174, 1140, 1092, 1205, 1337, 1462, 1453, 1579, 1...
## $ no2    <int> 113, 92, 114, 122, 116, 96, 77, 76, 60, -200, 34, 28, 4...
## $ no2pt  <int> 1692, 1559, 1555, 1584, 1490, 1393, 1333, 1333, 1276, 1...
## $ o3     <int> 1268, 972, 1074, 1203, 1110, 949, 733, 730, 620, 501, 4...
## $ tmp    <dbl> 136, 133, 119, 110, 112, 112, 113, 107, 107, 103, 101, ...
## $ rh     <dbl> 489, 477, 540, 600, 596, 592, 568, 600, 597, 602, 605, ...
## $ ah     <chr> "0,7578", "0,7255", "0,7502", "0,7867", "0,7888", "0,78...
#remove  section of all NA's
air <- air[1:9357, ]
air <- air[, -5] #too many -220's(NA) in this column
#fix the dates from d/m/y to m-d-y
date <- air$date
#date[55]
day <- str_sub(date,1,2)
#day[55]
#date[55]
str_sub(date, 1,2) <- str_sub(date, 4,5)
#date[55]
str_sub(date, 4,5) <- day
#date[55]
air$date <- date
air$date <- anydate(air$date)

air <- air %>% mutate(datetime = paste0(date," ", time))
air$datetime <- anytime(air$datetime)
Prepare each variable for use in a tidy dataset: Starting with carbon monoxide(CO) I will use random imputation for imputation of missing values [See description of random imputation]
#cogt
#change commas to periods
air$cogt <- as.numeric(gsub(",", "\\.", air$cogt))
summary(air$cogt)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -200.00    0.60    1.50  -34.21    2.60   11.90
#change -200's to NA's
air$cogt <- ifelse(air$cogt == -200, NA, air$cogt)
summary(air$cogt)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.100   1.100   1.800   2.153   2.900  11.900    1683
#visualize the data and its log trransformation
ggplot(air) +
  geom_density(aes(x = cogt)) 
ggplot(air) +
  geom_density(aes(x = log(cogt))) #looks more normal
#log transformation looks more normally distributed
air$lcogt <- log(air$cogt)

#find the parameters to use for random imputation of NA's
std <- sd(air$lcogt, na.rm = TRUE) 
mn <- mean(air$lcogt, na.rm = TRUE) 
sm <- sum(as.numeric(is.na(air$lcogt)))

#Random impute the NA's
air$lcogt <- ifelse(is.na(air$lcogt), rnorm(1, mn, std), air$lcogt) 

#see what we have
summary(air$lcogt)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -2.3026  0.1216  0.4055  0.4573  0.9555  2.4765

********************************************************


Next variable:
#copt
summary(air$copt)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    -200     921    1053    1049    1221    2040
air$copt <- ifelse(air$copt == -200, NA, air$copt)
summary(air$copt)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     647     937    1063    1100    1231    2040     366
ggplot(air) +
  geom_density(aes(x = copt)) 
ggplot(air) +
  geom_density(aes(x = log(copt))) #looks more normal
air$lcopt <- log(air$copt)

std <- sd(air$lcopt, na.rm = TRUE) 
mn <- mean(air$lcopt, na.rm = TRUE) 
sm <- sum(as.numeric(is.na(air$lcopt)))

air$lcopt <- ifelse(is.na(air$lcopt), rnorm(1, mn, std), air$lcopt) 

summary(air$lcopt)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   6.472   6.847   6.959   6.983   7.107   7.621

********************************************************


Next variable:
#benz
air$benz <- as.numeric(gsub(",", "\\.", air$benz))

summary(air$benz)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -200.000    4.000    7.900    1.866   13.600   63.700
air$benz <- ifelse(air$benz == -200, NA, air$benz)
summary(air$benz)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.10    4.40    8.20   10.08   14.00   63.70     366
ggplot(air) +
  geom_density(aes(x = benz))+
  xlim(0,5)
ggplot(air) +
  geom_density(aes(x = log(benz)))+
  xlim(0,5)
air$lbenz <- log(air$benz)

std <- sd(air$lbenz, na.rm = TRUE) 
mn <- mean(air$lbenz, na.rm = TRUE) 
sm <- sum(as.numeric(is.na(air$lbenz)))

air$lbenz <- ifelse(is.na(air$lbenz), rnorm(1, mn, std), air$lbenz) 
summary(air$lbenz)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -2.303   1.526   2.152   2.065   2.695   4.154

********************************************************


Next variable:
#nmhcpt
summary(air$nmhcpt)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -200.0   711.0   895.0   894.6  1105.0  2214.0
air$nmhcpt <- ifelse(air$nmhcpt == -200, NA, air$nmhcpt)
summary(air$nmhcpt)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   383.0   734.5   909.0   939.2  1116.0  2214.0     366
ggplot(air) +
  geom_density(aes(x = nmhcpt))+
  xlim(5,1000)
ggplot(air) +
  geom_density(aes(x = log(nmhcpt)))+
  xlim(5,8)
air$lnmhcpt <- log(air$nmhcpt)

std <- sd(air$lnmhcpt, na.rm = TRUE)
mn <- mean(air$lnmhcpt, na.rm = TRUE) 
sm <- sum(as.numeric(is.na(air$lnmhcpt)))

air$lnmhcpt <- ifelse(is.na(air$lnmhcpt), rnorm(1, mn, std), air$lnmhcpt) 
summary(air$lnmhcpt)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   5.948   6.611   6.825   6.805   7.008   7.703

********************************************************


Next variable:
#noxgt
summary(air$noxgt)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -200.0    50.0   141.0   168.6   284.0  1479.0
air$noxgt <- ifelse(air$noxgt == -200, NA, air$noxgt)

ggplot(air) +
  geom_density(aes(x = noxgt)) +
  xlim(0,800)
ggplot(air) +
  geom_density(aes(x = log(noxgt)))+
  xlim(0,10)
air$lnoxgt <- log(air$noxgt)

std <- sd(air$lnoxgt, na.rm = TRUE)
mn <- mean(air$lnoxgt, na.rm = TRUE) 
sm <- sum(as.numeric(is.na(air$lnoxgt)))

air$lnoxgt <- ifelse(is.na(air$lnoxgt), rnorm(1, mn, std), air$lnoxgt) 
summary(air$lnoxgt)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.6931  4.7185  5.2686  5.1743  5.6490  7.2991

********************************************************


Next variable:
#noxpt
summary(air$noxpt)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    -200     637     794     795     960    2683
air$noxpt <- ifelse(air$noxpt == -200, NA, air$noxpt)
summary(air$noxpt)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   322.0   658.0   806.0   835.5   969.5  2683.0     366
ggplot(air) +
  geom_density(aes(x = noxpt)) +
  xlim(250,1800)
ggplot(air) +
  geom_density(aes(x = log(noxpt)))+
  xlim(5,8)
air$lnoxpt <- log(air$noxpt)

std <- sd(air$lnoxpt, na.rm = TRUE)
mn <- mean(air$lnoxpt, na.rm = TRUE) 
sm <- sum(as.numeric(is.na(air$lnoxpt)))

air$lnoxpt <- ifelse(is.na(air$lnoxpt), rnorm(1, mn, std), air$lnoxpt) 
summary(air$lnoxpt)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   5.775   6.501   6.707   6.703   6.904   7.895

********************************************************


Next variable:
#no2
summary(air$no2)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -200.00   53.00   96.00   58.15  133.00  340.00
air$no2 <- ifelse(air$no2 == -200, NA, air$no2)

ggplot(air) +
  geom_density(aes(x = no2)) +
  xlim(250,1800)
ggplot(air) +
  geom_density(aes(x = log(no2)))+
  xlim(5,8)
#The log transformation looks like a gamma distribution so wqe will use a gamma for imputation
air$no2 <- ifelse(is.na(air$no2), rgamma(1, shape = 1, rate = 1), air$no2) 
summary(air$no2)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.879  53.000  96.000  93.400 133.000 340.000

********************************************************


Next variable:
#no2pt
summary(air$no2pt)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    -200    1185    1446    1391    1662    2775
air$no2pt <- ifelse(air$no2pt == -200, NA, air$no2pt)
summary(air$no2pt)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     551    1227    1463    1456    1674    2775     366
ggplot(air) +
  geom_density(aes(x = no2pt)) +
  xlim(250,2500)
ggplot(air) +
  geom_density(aes(x = log(no2pt))) +
  xlim(6, 8)
air$lno2pt <- log(air$no2pt)

std <- sd(air$lno2pt, na.rm = TRUE)
mn <- mean(air$lno2pt, na.rm = TRUE) 
sm <- sum(as.numeric(is.na(air$lnoxpt)))

air$lno2pt <- ifelse(is.na(air$lno2pt), rnorm(1, mn, std), air$lno2pt) 
summary(air$lno2pt)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   6.312   7.077   7.277   7.242   7.416   7.928

********************************************************


Next variable:
#o3
summary(air$o3)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -200.0   700.0   942.0   975.1  1255.0  2523.0
air$o3 <- ifelse(air$o3  == -200, NA, air$o3 )
summary(air$o3)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   221.0   731.5   963.0  1022.9  1273.5  2523.0     366
ggplot(air) +
  geom_density(aes(x = o3)) +
  xlim(250,1800)
ggplot(air) +
  geom_density(aes(x = log(o3))) +
  xlim(6, 8)
air$lo3 <- log(air$o3)

std <- sd(air$lo3, na.rm = TRUE)
mn <- mean(air$lo3, na.rm = TRUE) 
sm <- sum(as.numeric(is.na(air$lo3)))

air$lo3 <- ifelse(is.na(air$lo3), rnorm(1, mn, std), air$lo3) 
summary(air$lo3)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   5.398   6.573   6.848   6.841   7.135   7.833

********************************************************


Next variable:
#tmp
summary(air$tmp)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -200.0   109.0   172.0   168.2   241.0   446.0
air$tmp <- ifelse(air$tmp  == -200, NA, air$tmp )
summary(air$tmp)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   -19.0   118.0   178.0   183.2   244.0   446.0     366
ggplot(air) +
  geom_density(aes(x = tmp)) +
  xlim(-50, 500)
ggplot(air) +
  geom_density(aes(x = log(tmp))) +
  xlim(1.0, 6.5)
#first one looks more normal
air$tmp <- ifelse(is.na(air$tmp), rnorm(1, mn, std), air$tmp)

std <- sd(air$tmp, na.rm = TRUE)
mn <- mean(air$tmp, na.rm = TRUE) 
sm <- sum(as.numeric(is.na(air$tmp)))
air$ltmp <- log(air$tmp)
air$ltmp <- ifelse(is.na(air$ltmp), rnorm(1, mn, std), air$ltmp)

summary(air$tmp)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   -19.0   109.0   172.0   176.3   241.0   446.0

********************************************************


Next variable:
#rh
summary(air$rh)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -200.0   341.0   486.0   465.3   619.0   887.0
air$rh<- ifelse(air$rh  == -200, NA, air$rh )
summary(air$rh)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    92.0   358.0   496.0   492.3   625.0   887.0     366
ggplot(air) +
  geom_density(aes(x = rh)) +
  xlim(-50, 1000)
ggplot(air) +
  geom_density(aes(x = log(rh))) +
  xlim(4, 10)
#first one looks more normal

air$rh<- ifelse(is.na(air$rh), rnorm(1, mn, std), air$rh)
#no log transform needed here
summary(air$rh)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    92.0   341.0   486.0   481.3   619.0   887.0

********************************************************


Next variable:
#ah
air$ah <- as.numeric(gsub(",", "\\.", air$ah))

summary(air$ah)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -200.0000    0.6923    0.9768   -6.8376    1.2962    2.2310
air$ah<- ifelse(air$ah  == -200, NA, air$ah )
summary(air$ah)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.1847  0.7368  0.9954  1.0255  1.3137  2.2310     366
ggplot(air) +
  geom_density(aes(x = ah)) +
  xlim(0, 2.5)
ggplot(air) +
  geom_density(aes(x = log(ah))) +
  xlim(-1,1)
air$lah <- log(air$ah)

std <- sd(air$lah, na.rm = TRUE)
mn <- mean(air$lah, na.rm = TRUE) 
sm <- sum(as.numeric(is.na(air$lah)))

air$lah<- ifelse(is.na(air$lah), rnorm(1, mean = mn, std), air$lah)

summary(air$lah)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -1.68902 -0.29290  0.01528 -0.05716  0.25944  0.80245

********************************************************


Now, let’s keep the transformed variables and rename it “air1”
air1 <- air %>% dplyr::select(
  datetime, rh, tmp, no2, lcogt, lcopt, lbenz, lnmhcpt,
  lnoxgt, lnoxpt, lno2pt, lo3, lah)
glimpse(air1)
## Observations: 9,357
## Variables: 13
## $ datetime <dttm> 2004-03-10 18:00:00, 2004-03-10 19:00:00, 2004-03-10...
## $ rh       <dbl> 489, 477, 540, 600, 596, 592, 568, 600, 597, 602, 605...
## $ tmp      <dbl> 136, 133, 119, 110, 112, 112, 113, 107, 107, 103, 101...
## $ no2      <dbl> 113.0000000, 92.0000000, 114.0000000, 122.0000000, 11...
## $ lcogt    <dbl> 0.95551145, 0.69314718, 0.78845736, 0.78845736, 0.470...
## $ lcopt    <dbl> 7.215240, 7.163947, 7.245655, 7.226936, 7.148346, 7.0...
## $ lbenz    <dbl> 2.47653840, 2.24070969, 2.19722458, 2.21920348, 1.871...
## $ lnmhcpt  <dbl> 6.952729, 6.861711, 6.844815, 6.854355, 6.728629, 6.6...
## $ lnoxgt   <dbl> 5.111988, 4.634729, 4.875197, 5.147494, 4.875197, 4.4...
## $ lnoxpt   <dbl> 6.962243, 7.068172, 7.038784, 6.995766, 7.094235, 7.1...
## $ lno2pt   <dbl> 7.433667, 7.351800, 7.349231, 7.367709, 7.306531, 7.2...
## $ lo3      <dbl> 7.145196, 6.879356, 6.979145, 7.092574, 7.012115, 6.8...
## $ lah      <dbl> -0.2773358, -0.3208942, -0.2874154, -0.2399083, -0.23...

********************************************************


Trim the ends of the time series and rename: “air2”
#make the time series start at 12:00 midnight and end also
air2 <- air1[-c(1:6), ]
air2 <- air2[1:9336, ]
air2$day <- rep(1:389, each = 24)

air2$date <- factor(anydate(air2$datetime))
air2$month <- factor(month(air2$date))
air2$dayofweek <- factor(wday(air2$date, label = TRUE))

********************************************************


We will center and scale the numeric variables
#center and scale the numerical data
air3 <- air2
air3 <- air3 %>%
  mutate_at(.vars = vars(rh, tmp, no2, lcogt, lcopt, lbenz,    
                         lnmhcpt, lnoxgt, lnoxpt,   
                         lno2pt, lo3, lah),
            .funs = funs(scale))

********************************************************

Please see the next page for data exploration and analysis.

No comments:

Post a Comment