
# This code aggregates intersection sensor data obtained from VDOT.

# The data cover May 2018: one CSV file per day, recorded at 15-minute intervals.

# First problem: the raw data carry no direction information, so direction has to be loaded separately and matched to these records. We also only need certain sensors, so only those are extracted.

# Second problem: some hours are missing for some sensors, which would skew the aggregation, so missing slots are imputed by carrying forward the nearest preceding observation.

# There was also a request to make the aggregation interval customizable (1 hour, 30 min, ...).

# The final product is a cleaned CSV file, ready for further use, covering the selected days and sensors at the selected time interval.
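
# Assumed raw-file layout (an assumption inferred from the column selections
# below, not documented by VDOT): column 1 = detector ID, column 2 = timestamp,
# column 4 = 15-minute volume count.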

 

# loading required libraries (plyr before dplyr, so dplyr's verbs end up
# masking plyr's rather than the other way round)

library(plyr)

library(dplyr)

library(readxl)

library(lubridate)

library(zoo)

 

# reading the file that holds direction info for each detector

direct<-read_excel("siteanddetector_8-26-19.xlsx",1)

dirc<-direct$DETECTOR_ID

# build an exact-match regex over all wanted detector IDs: "^id1$|^id2$|..."

test<-paste(dirc, collapse = "$|^")

test2<-paste("^",test,"$", sep="")
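
# Illustration with hypothetical IDs: dirc <- c("101", "205") would give
# test2 == "^101$|^205$", so partial matches such as "1010" are excluded.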

 

# Below is the beginning of the main loop, with sub-loops that aggregate the data

 

date<-c(1:30)  # day range to process; for a single day, set this to that day's number

# get the list of files in the working directory (one CSV file per day)

filenames <- list.files( pattern="\\.csv$", full.names=TRUE)  # regex, not a glob

# substring(., 3, 21) drops the leading "./" from full.names = TRUE and assumes
# fixed-width file names; basename(filenames[1]) would be the robust equivalent

name<-substring(filenames[1],3,21)

# read one row to copy the column structure: detector ID, timestamp, volume

blank = read.table(name, sep = "," ,header = FALSE, nrows=1)

blank<-blank[,c(1,2,4)]

a<-blank[0,]  # empty accumulator for the cleaned output of all days

 

for (m in date){

  # open one day's file

  name<-substring(filenames[m],3,21)

  data = read.table(name, sep = "," ,header = FALSE)

  dat<-data[,c(1,2,4)]  # keep detector ID (V1), timestamp (V2), volume (V4)

  t<-filter(dat, grepl(test2,dat$V1))  # keep only the selected detectors

#time manipulation

  t$V2<-as.POSIXct(t$V2, tz = "EST")  # POSIXct (not POSIXlt) is safe in a data frame and in merge()

  t$V2<-floor_date(t$V2, "15 minutes")  # snap timestamps onto the 15-min grid

  # below: fill in missing 15-minute slots for every detector

  # full-day template grid: 96 slots of 900 s (15 min) for every detector

  q<-data.frame(V1=dirc[1],V2=seq(from = t[1,2], length.out = 96, by = 900))

  q<-q[0,]

  for (ty in 1:length(dirc)){

  re<-data.frame(V1=dirc[ty],V2=seq(from = t[1,2], length.out = 96, by = 900))

  q<-rbind(q,re)

  }
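
  # Equivalent one-liner (same rows, different order; merge() re-sorts anyway):
  # q <- expand.grid(V1 = dirc, V2 = seq(from = t[1, 2], length.out = 96, by = 900))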

  df1<-merge(x=q,y=t, by=c("V1","V2"), all.x=TRUE)  # missing slots become NA

  # fill NAs by last observation carried forward (LOCF); na.rm = FALSE keeps
  # any leading NA so the result matches the column length

  df1$V5<-na.locf(df1$V4, na.rm = FALSE)
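
  # Behaviour sketch: na.locf(c(NA, 1, NA, 3), na.rm = FALSE) -> NA 1 1 3;
  # without na.rm = FALSE the leading NA would be dropped and the assignment
  # above would fail with a length mismatch.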

  colnames(df1) <- c("DETECTOR_ID", "DateTime","Vol", "CorrectedVol") 

  df<-merge(x=df1,y=direct,by="DETECTOR_ID",all.x=TRUE)  # attach site metadata

  # sum corrected volumes across detectors per site, direction and time slot

  play<-ddply(df, .(DEVICE_ID,DateTime,SITE_NAME, Latitude,Longitude, Direction), summarise, Vol=sum(CorrectedVol))

 

  # detecting weekday (vectorised; no row-by-row loop needed)

  play[,8]<-weekdays(as.Date(play[,2], tz = "EST"))

  colnames(play) <- c("SITE_ID", "DateTime","SITE_NAME", "Latitude","Longitude","Direction","Vol","Weekdays")

    

  #Time aggregation loop

  p=4  # slots per output row, set by the user: 2 = 30 minutes, 4 = 1 hour, ...
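
  # Sanity check (assumption: each detector-day contributes 96 slots, so p
  # must divide 96 for windows never to straddle two detectors/days):
  stopifnot(96 %% p == 0)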

  # sort so that each run of p consecutive rows belongs to one site/direction

  my <- play[order(play$SITE_ID, play$Direction,play$DateTime),]

  mydata1<-my[0,]

  # every p-th row keeps its metadata and receives the sum of its p-slot
  # window; the skipped row indices stay NA and are dropped just below

  for (i in seq(1,nrow(my),p)){

    mydata1[i,1:6]<- my[i,1:6]

    mydata1[i,8]<- my[i,8]

    mydata1[i,7] <- sum(my[i:(i+p-1),7])}

  mydata1<-na.omit(mydata1)  # drop the NA filler rows

    a<-rbind(a, mydata1)

  remove(play)

  remove(mydata1)

    print(paste("day",m))

 

  # checkpoint: cumulative file of all days processed so far
  # (note: write.csv() fixes the separator itself, so no sep argument)

  write.csv(a, file = paste('Generatednew/',m,'th-day-selected.csv', sep=""))

}

# all-days data at the selected step (1 hour for p = 4)

write.csv(a, file = 'Generatednew/SelectedDevices.csv')