# ---
# This code aggregates sensor data at intersections, obtained from VDOT.
# The data is for May 2018: one CSV file per day, in 15-minute intervals.
# The first problem is that the data has no direction info; it must be
# loaded separately and matched against this data. Also, only certain
# sensors are needed, so just those are extracted.
# Another problem is that some hours are missing for some sensors, which
# makes the aggregation wrong, so we impute missing values from the
# nearest time slot.
# There was a request for customizable code that aggregates data over
# different time intervals (1 hour, 2 hours, 30 min, ...).
# The final product is a CSV file with the selected days and sensors, at
# the selected time interval (cleaned and ready for further use).
# Loading needed libraries
library(dplyr)
library(readxl)
library(plyr)
library(lubridate)
library(zoo)
# Read the lookup table that maps each detector to its site/direction info.
direct <- read_excel("siteanddetector_8-26-19.xlsx", 1)
dirc <- direct$DETECTOR_ID
# Build a fully anchored alternation regex ("^id1$|^id2$|...") so that a
# detector id only matches exactly, never as a substring of another id.
test2 <- paste0("^", paste(dirc, collapse = "$|^"), "$")
# Below is the beginning of a loop (with sub-loops) to aggregate the data.
# DAY range to process; to run a single specific day, put just that day.
# (Note: this masks base::date for the rest of the script.)
date <- c(1:30)
# List the daily CSV files in the working directory (one file per day).
# "\\.csv$" is a proper regex anchored at the end of the name; the glob
# "*.csv" is not valid regex syntax for this argument.
filenames <- list.files(pattern = "\\.csv$", full.names = TRUE)
# Accumulator for all days' aggregated rows. Starting from NULL lets the
# first rbind() in the day loop adopt mydata1's columns directly; the old
# 3-column template read from the first file's header row does not match
# mydata1's 8 columns, which makes rbind() fail on a name mismatch.
a <- NULL
for (m in date) {
  # ---- Load one day's file ----
  # Use the listed path directly; the old substring(filenames[m], 3, 21)
  # hack assumed every filename was exactly 19 characters long.
  data <- read.table(filenames[m], sep = ",", header = FALSE)
  # Keep detector id (V1), timestamp (V2) and volume (V4).
  dat <- data[, c(1, 2, 4)]
  # Keep only rows whose detector id is in the requested list.
  # ("obs" rather than "t": t masks base::t.)
  obs <- filter(dat, grepl(test2, dat$V1))

  # ---- Time manipulation ----
  # POSIXct, not POSIXlt: a POSIXlt column in a data frame misbehaves in
  # merge()/ddply(). Then snap timestamps onto 15-minute boundaries.
  obs$V2 <- as.POSIXct(obs$V2, tz = "EST")
  obs$V2 <- floor_date(obs$V2, "15 minutes")

  # ---- Impute missing time slots ----
  # Full grid: every requested detector x all 96 15-minute slots of the
  # day (900 s apart), starting from the day's first observed slot.
  slots <- seq(from = obs[1, 2], length.out = 96, by = 900)
  q <- data.frame(V1 = rep(dirc, each = length(slots)),
                  V2 = rep(slots, times = length(dirc)))
  df1 <- merge(x = q, y = obs, by = c("V1", "V2"), all.x = TRUE)
  # Fill gaps with the nearest earlier observation. na.rm = FALSE keeps
  # the vector length intact, and the fromLast pass back-fills NAs at the
  # start of the day — plain na.locf() would drop leading NAs and the
  # column assignment would fail on a length mismatch.
  df1$V5 <- na.locf(na.locf(df1$V4, na.rm = FALSE), fromLast = TRUE)
  colnames(df1) <- c("DETECTOR_ID", "DateTime", "Vol", "CorrectedVol")

  # Attach site/direction metadata, then sum detector volumes per site,
  # time slot and direction.
  df <- merge(x = df1, y = direct, by = "DETECTOR_ID", all.x = TRUE)
  play <- ddply(df,
                .(DEVICE_ID, DateTime, SITE_NAME, Latitude, Longitude,
                  Direction),
                summarise, Vol = sum(CorrectedVol))

  # Weekday label, computed vectorised instead of row by row.
  play[[8]] <- weekdays(as.Date(play[[2]], tz = "EST"))
  colnames(play) <- c("SITE_ID", "DateTime", "SITE_NAME", "Latitude",
                      "Longitude", "Direction", "Vol", "Weekdays")

  # ---- Time-aggregation loop ----
  # p = rows collapsed per aggregate; parameter the user customizes:
  # 2 -> half hour, 4 -> full hour, ...
  p <- 4
  my <- play[order(play$SITE_ID, play$Direction, play$DateTime), ]
  # Collapse each run of p consecutive rows into one row carrying the
  # first row's labels and the run's summed Vol. A trailing partial run
  # is summed as-is instead of being silently dropped through NA
  # indexing as in the old version.
  starts <- seq(1, nrow(my), by = p)
  chunks <- lapply(starts, function(i) {
    last <- min(i + p - 1, nrow(my))
    row <- my[i, ]
    row[[7]] <- sum(my[i:last, 7])
    row
  })
  mydata1 <- do.call(rbind, chunks)

  # Append this day's result. The guard makes the first day work whether
  # the accumulator starts as NULL or as an empty template whose columns
  # would otherwise make rbind() fail.
  if (is.null(a) || nrow(a) == 0) {
    a <- mydata1
  } else {
    a <- rbind(a, mydata1)
  }
  print(paste("day", m))

  # Cumulative file re-written after each processed day. write.csv()
  # ignores a sep argument, so none is passed.
  write.csv(a, file = paste0("Generatednew/", m, "th-day-selected.csv"))
}
# All-days data at the chosen (1-hour) step. write.csv() always uses ","
# and ignores a sep argument, so none is passed.
write.csv(a, file = "Generatednew/SelectedDevices.csv")