
# This code aggregates intersection sensor data obtained from VDOT.

# The data cover May 2018: one CSV file per day, recorded at 15-minute intervals.

# First problem: the raw data carry no direction information, so direction has to be loaded separately and matched to these records. We also only need certain sensors, so only those are extracted.

# Second problem: some hours are missing for some sensors, which would skew the aggregation, so missing slots are imputed by carrying forward the nearest preceding observation.

# There was also a request to make the aggregation interval customizable (1 hour, 30 min, ...).

# The final product is a cleaned CSV file, ready for further use, covering the selected days and sensors at the selected time interval.
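
# Assumed raw-file layout (an assumption inferred from the column selections
# below, not documented by VDOT): column 1 = detector ID, column 2 = timestamp,
# column 4 = 15-minute volume count.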

 

# loading required libraries (plyr before dplyr, so dplyr's verbs end up
# masking plyr's rather than the other way round)

library(plyr)

library(dplyr)

library(readxl)

library(lubridate)

library(zoo)

 

# reading the file that holds direction info for each detector

direct<-read_excel("siteanddetector_8-26-19.xlsx",1)

dirc<-direct$DETECTOR_ID

# build an exact-match regex over all wanted detector IDs: "^id1$|^id2$|..."

test<-paste(dirc, collapse = "$|^")

test2<-paste("^",test,"$", sep="")
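
# Illustration with hypothetical IDs: dirc <- c("101", "205") would give
# test2 == "^101$|^205$", so partial matches such as "1010" are excluded.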

 

# Below is the beginning of the main loop, with sub-loops that aggregate the data

 

date<-c(1:30)  # day range to process; for a single day, set this to that day's number

# get the list of files in the working directory (one CSV file per day)

filenames <- list.files( pattern="\\.csv$", full.names=TRUE)  # regex, not a glob

# substring(., 3, 21) drops the leading "./" from full.names = TRUE and assumes
# fixed-width file names; basename(filenames[1]) would be the robust equivalent

name<-substring(filenames[1],3,21)

# read one row to copy the column structure: detector ID, timestamp, volume

blank = read.table(name, sep = "," ,header = FALSE, nrows=1)

blank<-blank[,c(1,2,4)]

a<-blank[0,]  # empty accumulator for the cleaned output of all days

 

for (m in date){

  # open one day's file

  name<-substring(filenames[m],3,21)

  data = read.table(name, sep = "," ,header = FALSE)

  dat<-data[,c(1,2,4)]  # keep detector ID (V1), timestamp (V2), volume (V4)

  t<-filter(dat, grepl(test2,dat$V1))  # keep only the selected detectors

#time manipulation

  t$V2<-as.POSIXct(t$V2, tz = "EST")  # POSIXct (not POSIXlt) is safe in a data frame and in merge()

  t$V2<-floor_date(t$V2, "15 minutes")  # snap timestamps onto the 15-min grid

  # below: fill in missing 15-minute slots for every detector

  # full-day template grid: 96 slots of 900 s (15 min) for every detector

  q<-data.frame(V1=dirc[1],V2=seq(from = t[1,2], length.out = 96, by = 900))

  q<-q[0,]

  for (ty in 1:length(dirc)){

  re<-data.frame(V1=dirc[ty],V2=seq(from = t[1,2], length.out = 96, by = 900))

  q<-rbind(q,re)

  }
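
  # Equivalent one-liner (same rows, different order; merge() re-sorts anyway):
  # q <- expand.grid(V1 = dirc, V2 = seq(from = t[1, 2], length.out = 96, by = 900))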

  df1<-merge(x=q,y=t, by=c("V1","V2"), all.x=TRUE)  # missing slots become NA

  # fill NAs by last observation carried forward (LOCF); na.rm = FALSE keeps
  # any leading NA so the result matches the column length

  df1$V5<-na.locf(df1$V4, na.rm = FALSE)
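
  # Behaviour sketch: na.locf(c(NA, 1, NA, 3), na.rm = FALSE) -> NA 1 1 3;
  # without na.rm = FALSE the leading NA would be dropped and the assignment
  # above would fail with a length mismatch.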

  colnames(df1) <- c("DETECTOR_ID", "DateTime","Vol", "CorrectedVol") 

  df<-merge(x=df1,y=direct,by="DETECTOR_ID",all.x=TRUE)  # attach site metadata

  # sum corrected volumes across detectors per site, direction and time slot

  play<-ddply(df, .(DEVICE_ID,DateTime,SITE_NAME, Latitude,Longitude, Direction), summarise, Vol=sum(CorrectedVol))

 

  # detecting weekday (vectorised; no row-by-row loop needed)

  play[,8]<-weekdays(as.Date(play[,2], tz = "EST"))

  colnames(play) <- c("SITE_ID", "DateTime","SITE_NAME", "Latitude","Longitude","Direction","Vol","Weekdays")

    

  #Time aggregation loop

  p=4  # slots per output row, set by the user: 2 = 30 minutes, 4 = 1 hour, ...
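
  # Sanity check (assumption: each detector-day contributes 96 slots, so p
  # must divide 96 for windows never to straddle two detectors/days):
  stopifnot(96 %% p == 0)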

  # sort so that each run of p consecutive rows belongs to one site/direction

  my <- play[order(play$SITE_ID, play$Direction,play$DateTime),]

  mydata1<-my[0,]

  # every p-th row keeps its metadata and receives the sum of its p-slot
  # window; the skipped row indices stay NA and are dropped just below

  for (i in seq(1,nrow(my),p)){

    mydata1[i,1:6]<- my[i,1:6]

    mydata1[i,8]<- my[i,8]

    mydata1[i,7] <- sum(my[i:(i+p-1),7])}

  mydata1<-na.omit(mydata1)  # drop the NA filler rows

    a<-rbind(a, mydata1)

  remove(play)

  remove(mydata1)

    print(paste("day",m))

 

  # checkpoint: cumulative file of all days processed so far
  # (note: write.csv() fixes the separator itself, so no sep argument)

  write.csv(a, file = paste('Generatednew/',m,'th-day-selected.csv', sep=""))

}

# all-days data at the selected step (1 hour for p = 4)

write.csv(a, file = 'Generatednew/SelectedDevices.csv')