library(tidyverse)
library(here)
library(fs)
library(readxl)
library(janitor)

#This script checks when the CTD pump started working

# Read in the full (not sectioned) casts ----
# Every file in the folder is read as raw text (no header row, so the text
# column is named X1) and each row is tagged with the path of the file it came
# from, so header metadata and data rows can be told apart per cast later on.
#
# NOTE: the absolute path is handed to dir_ls() directly. here() is meant for
# project-relative paths; wrapping an absolute path in it adds nothing and,
# depending on the installed here/rprojroot version, may glue the path onto
# the project root and point at a folder that does not exist.
cnv_dir <- "C:/Users/katey/Desktop/2022 CTD Data Processing/2022_wk_auto_ctd_sn7783/data/01_datacnv"

# File names of the casts that failed; they are excluded from the check.
# Matching is done on the exact file name instead of one regex per cast.
failed_casts <- c(
  "SBE19plus_01907783_20220814-CTD075.cnv",
  "SBE19plus_01907783_20220814-CTD076.cnv",
  "SBE19plus_01907783_20220814-CTD077.cnv",
  "SBE19plus_01907783_20220814-CTD078.cnv",
  "SBE19plus_01907783_20220814-CTD079.cnv",
  "SBE19plus_01907783_20220815-CTD084.cnv"
)

data_full <- as.character(fs::dir_ls(cnv_dir)) %>% # list of files to import
  map(function(path) {
    read_tsv(path, col_names = FALSE) %>% # import the file
      mutate(source_file = path) # add the full file path to each row
  }) %>%
  bind_rows() %>% # join all casts into one table
  filter(!basename(source_file) %in% failed_casts) # delete rows of failed casts

# Read in the sectioned casts ----
# Same import as for the full casts: every file in the folder is read as raw
# text (column X1) and each row is tagged with the path of its source file.
#
# NOTE: the absolute path is handed to dir_ls() directly. here() is meant for
# project-relative paths; wrapping an absolute path in it adds nothing and,
# depending on the installed here/rprojroot version, may glue the path onto
# the project root and point at a folder that does not exist.
section_dir <- "C:/Users/katey/Desktop/2022 CTD Data Processing/2022_wk_auto_ctd_sn7783/data/02_section"

data_sectioned <- as.character(fs::dir_ls(section_dir)) %>% # list of files to import
  map(function(path) {
    read_tsv(path, col_names = FALSE) %>% # import the file
      mutate(source_file = path) # add the full file path to each row
  }) %>%
  bind_rows() # join all casts into one table

# Minimum conductivity frequency threshold ----
# Keep only the rows that mention MinimumCondFreq, strip the opening tag, any
# "*" characters and the closing tag (in that order), convert what is left to
# a number and store it under the name min_cond_freq.
data_full_min_cond_freq <- data_full %>%
  filter(str_detect(X1, "MinimumCondFreq")) %>%
  mutate(
    X1 = X1 %>%
      str_remove_all("<MinimumCondFreq>") %>%
      str_remove_all("\\*") %>%
      str_remove_all("</MinimumCondFreq>") %>%
      as.numeric()
  ) %>%
  rename(min_cond_freq = X1)

# Pump delay ----
# Keep only the rows that mention PumpDelay (the delay applied after the
# min. cond. freq. threshold has been met), strip the opening tag, any "*"
# characters and the closing tag (in that order), convert what is left to a
# number and store it under the name pump_delay.
data_full_pump_delay <- data_full %>%
  filter(str_detect(X1, "PumpDelay")) %>%
  mutate(
    X1 = X1 %>%
      str_remove_all("<PumpDelay>") %>%
      str_remove_all("\\*") %>%
      str_remove_all("</PumpDelay>") %>%
      as.numeric()
  ) %>%
  rename(pump_delay = X1)
 
# Sample rate ----
# Keep only the "# interval" header rows, drop the label text so only the value
# remains, convert it to a number and store it under the name sample_rate.
# NOTE(review): the label reads "interval = seconds", so the value is
# presumably the time between scans in seconds - confirm.
data_full_sample_rate <- data_full %>%
  filter(str_detect(X1, "# interval")) %>%
  mutate(X1 = as.numeric(str_remove_all(X1, "# interval = seconds: "))) %>%
  rename(sample_rate = X1)

# Pump start criteria per cast ----
# Combine threshold, pump delay and sample rate into one table keyed by
# source_file, then work out how many samples are skipped between the moment
# the min. cond. freq. threshold is met and the moment the pump starts working.
# skip_samples is pump_delay divided by sample_rate (dividing by 1 in between,
# as the earlier formula did, does not change the value).
criteria <- data_full_min_cond_freq %>%
  left_join(data_full_pump_delay, by = "source_file") %>%
  left_join(data_full_sample_rate, by = "source_file") %>%
  mutate(skip_samples = pump_delay / sample_rate)

# Conductivity frequency of every scan, compared with the threshold ----
# Used to figure out at what sample the min. cond. freq. threshold was met for
# each cast.
data_full <- data_full %>%
  # drop the metadata rows, i.e. every row containing "#" or "*"
  filter(!grepl("#", X1), !grepl("\\*", X1)) %>%
  # split the data column on runs of spaces into 30 columns named "0" to "29"
  separate(col = X1, into = as.character(0:29), sep = " +") %>%
  # keep the source file name, column "26" (frequency 1 channel) and
  # column "0" (scan count), giving the last two readable names
  select(source_file, freq = "26", scan_count = "0") %>%
  # both columns are still text after the split, so make them numeric
  mutate(
    freq = as.numeric(freq),
    scan_count = as.numeric(scan_count)
  ) %>%
  # attach the per-cast criteria (threshold, pump delay, sample rate, skipped samples)
  left_join(criteria, by = "source_file") %>%
  # difference between the min. cond. freq. threshold and the measured cond. freq.;
  # it turns negative once the measured frequency is above the threshold
  mutate(diff = min_cond_freq - freq)

# First scan with the pump on, per cast ----
# 1. Keep the scans where the min. cond. freq. threshold has been met
#    (diff < 0) and, for each cast, only the first such row in file order.
# 2. The pump would have started working at that scan count plus the samples
#    skipped during the pump delay.
# 3. Reduce source_file to the bare file name so it can serve as a key column.
#    basename() is used instead of removing a hard-coded directory with a
#    regex: it does not depend on the folder literal matching the listed paths
#    exactly (a mismatch would silently leave the key unchanged and turn every
#    later join result into NA) and no regex metacharacters are involved.
data_full_first_occurrence_of_neg <- data_full %>%
  dplyr::filter(diff < 0) %>%
  distinct(source_file, .keep_all = TRUE) %>% # first matching row of each cast
  transmute(
    source_file = basename(source_file),
    first_scan_pump_on = scan_count + skip_samples
  )

# First scan of each sectioned cast ----
# Keep the "# span 0" header rows (cast number range), split them on runs of
# spaces into six pieces and take the fifth piece ("e"), which holds the start
# scan followed by a comma; the comma is removed.
# source_file is reduced to the bare file name so it can serve as a key column.
# basename() is used instead of removing a hard-coded directory with a regex:
# it does not depend on the folder literal matching the listed paths exactly
# (a mismatch would silently leave the key unchanged and break the later join).
data_sectioned <- data_sectioned %>%
  filter(grepl("# span 0", X1)) %>%
  separate(col = X1, into = c("a", "b", "c", "d", "e", "f"), sep = " +") %>%
  transmute(
    first_scan_of_sectioned_cast = str_remove_all(e, ","),
    source_file = basename(source_file)
  )

# Compare sectioned cast start with pump start ----
# Join the first scan of each sectioned cast with the scan at which the pump
# came on, make the first-scan column numeric and take the difference
# (first scan of sectioned cast minus first scan with the pump on).
# The two casts filtered out at the end are failed casts: earlier runs of this
# script showed that the minimum cond. freq. was never met during them.
merged <- data_sectioned %>%
  left_join(data_full_first_occurrence_of_neg, by = "source_file") %>%
  mutate(
    first_scan_of_sectioned_cast = as.numeric(first_scan_of_sectioned_cast),
    scan_diff = first_scan_of_sectioned_cast - first_scan_pump_on
  ) %>%
  filter(
    !grepl("SBE19plus_01907783_20220815-CTD082.cnv", source_file),
    !grepl("SBE19plus_01907783_20220815-CTD083.cnv", source_file)
  )

# CHECK THAT BOTH PRINTED VALUES ARE ZERO!
# Number of missing values anywhere in the table; 0 means there are no NAs.
merged %>% is.na() %>% sum()
# Number of casts whose sectioned part starts before the pump came on; 0 means
# the pump started working before the first scan of every sectioned cast, i.e.
# it was working throughout the whole downcast and upcast.
sum(merged$scan_diff < 0, na.rm = TRUE)
