# Imports
import os
import pandas as pd
import datefinder
import datetime
import time
from datetime import timedelta

from init import *

# Module-level accumulators: one entry per processed .cnv file.
cnv_dataframes = list()
cnv_dateTimes = list()
cnvfilenames = list()  # declared but never filled by this script; kept so the name still exists


def _split_key_value(text, sep):
    """Split ``text`` at the first ``sep`` and return the stripped (key, value) pair.

    Everything after the first separator is kept, so a value that itself
    contains the separator (a time such as 19:20:08, a second '=' ...) is not
    truncated. When ``sep`` is absent the value is an empty string.
    """
    key, _, value = text.partition(sep)
    return key.strip(), value.strip()


def _read_star_metadata(lines, idx):
    """Consume the consecutive lines starting with '*' beginning at ``idx``.

    Lines containing '=' (checked first) or ':' become key/value entries; any
    other line is stored as 'Note <n>'. Notes are merged in after the keyed
    entries.

    Returns:
        (metadata dict, index of the first line that does not start with '*').
    """
    metadata = {}
    notes_meta = {}
    while idx < len(lines) and lines[idx].startswith('*'):
        line = lines[idx]
        text = line[2:]  # drop the two leading characters (the '*' marker and the one after it)
        if '=' in line:
            key, value = _split_key_value(text, '=')
            metadata[key] = value
        elif ':' in line:
            key, value = _split_key_value(text, ':')
            metadata[key] = value
        else:
            notes_meta['Note {0}'.format(len(notes_meta))] = text.strip()
        idx += 1
    metadata.update(notes_meta)
    return metadata, idx


def _read_hash_header(lines, idx):
    """Consume the consecutive lines starting with '#' beginning at ``idx``.

    Lines containing '<' are collected verbatim as XML text. Every other line
    is split at its first '=' and filed by key:
      * keys starting with 'name' -> varMetadata, e.g. 'name 1': 'latitude: Latitude [deg]'
      * keys starting with 'span' -> span (value ranges of the variables)
      * anything else             -> xml_data
    The 'start_time' and 'interval' keys are additionally parsed into a
    datetime and a number of seconds.

    Returns:
        (header dict, index of the first line that does not start with '#').
        header['start_dateTime'] / header['interval'] stay None when the
        corresponding key is not present.
    """
    header = {
        'varMetadata': {},
        'span': {},
        'xml_data': {},
        'xml_lines': [],
        'start_dateTime': None,
        'interval': None,
    }
    while idx < len(lines) and lines[idx].startswith('#'):
        text = lines[idx][2:]
        idx += 1

        if '<' in text:  # e.g. <Sensors count="8" >
            header['xml_lines'].append(text.rstrip('\n'))
            continue

        key, value = _split_key_value(text, '=')
        if key.startswith('name'):
            header['varMetadata'][key] = value
        elif key.startswith('span'):
            header['span'][key] = value
        else:
            header['xml_data'][key] = value

        if key == 'start_time':
            # E.g. start_time = Jul 13 2021 19:20:08 [System UTC, first data scan.]
            # When datefinder reports several dates the last one is used.
            found = list(datefinder.find_dates(value))
            if found:
                header['start_dateTime'] = found[-1]
        elif key == 'interval':
            # The number follows the last ':' of the value; float() also
            # accepts fractional intervals.
            header['interval'] = float(value.rpartition(':')[2])
    return header, idx


def _convert_cnv_file(cnv_filepath, meta_xml):
    """Parse one .cnv file, write its XML header lines, and build its table.

    Args:
        cnv_filepath: path of the .cnv file to read (decoded as latin-1).
        meta_xml: path of the XML file that receives the '<...>' header lines.

    Returns:
        (seabird_data, date_time_cnv): the DataFrame of data records with an
        added 'date_time' column, and the list of datetimes in that column.

    Raises:
        ValueError: if the header has no usable start_time or interval.
    """
    with open(cnv_filepath, mode="r", encoding="latin-1") as cnv:
        cnv_file_contents = cnv.readlines()

    allData = cnv_file_contents[2:]  # everything except the first 2 lines of the file

    # Metadata block (lines preceded by '*'); only its end position is needed below.
    _metadata, i = _read_star_metadata(allData, 0)
    i += 2  # the two lines after the '*' block are skipped before the '#' block is read

    # Header block (lines preceded by '#').
    header, i = _read_hash_header(allData, i)

    # The XML declaration carries no encoding attribute, which means UTF-8.
    with open(meta_xml, 'w', encoding='utf-8') as xml_file:
        xml_file.write('<?xml version="1.0"?>\n')
        xml_file.writelines(xml_line + '\n' for xml_line in header['xml_lines'])

    start_dateTime = header['start_dateTime']
    interval = header['interval']
    if start_dateTime is None or interval is None:
        # Fail loudly rather than reusing values left over from an earlier file.
        raise ValueError(
            'No usable start_time/interval found in the header of {0}'.format(cnv_filepath)
        )

    # Column headers: the part before ':' of each variable description,
    # e.g. 'latitude' from 'latitude: Latitude [deg]'.
    varHeaders_1 = [desc.partition(':')[0] for desc in header['varMetadata'].values()]

    # Data records start one line after the '#' block
    # (presumably that line is an end-of-header marker -- TODO confirm).
    # Each record is whitespace-separated; blank lines carry no record.
    rawData = [tokens for tokens in (line.split() for line in allData[i + 1:]) if tokens]

    seabird_data = pd.DataFrame(rawData, columns=varHeaders_1)

    # Remove the flag column - has no valuable information
    if 'flag' in seabird_data.columns:
        seabird_data = seabird_data.drop('flag', axis=1)

    # One timestamp per record: start, start + interval, start + 2*interval, ...
    step = datetime.timedelta(seconds=interval)
    date_time_cnv = [start_dateTime + k * step for k in range(len(seabird_data))]
    seabird_data['date_time'] = date_time_cnv

    return seabird_data, date_time_cnv


# Read all the cnv files from the thermosalinograph folder (from init file)
for root, dirs, files in os.walk(input_directory_TSG):
    dirs[:] = []  # only the top-level folder is processed, so do not descend further
    for filename in files:
        if not filename.endswith('.cnv') or filename.startswith('~'):
            continue

        cnv_file = filename[:-4]  # file name without the .cnv extension, reused for the output names
        cnv_filepath = os.path.join(root, filename)
        meta_xml = os.path.join(input_directory_TSG, cnv_file + '_metaxml.xml')

        seabird_data, date_time_cnv = _convert_cnv_file(cnv_filepath, meta_xml)

        # Keep the per-file results in the lists covering all cnv files
        cnv_dataframes.append(seabird_data)
        cnv_dateTimes.append(date_time_cnv)

        # Save each cnv dataframe, with the added date_time column, as csv
        csvname = os.path.join(output_directory_TSG, cnv_file + '.csv')
        seabird_data.to_csv(csvname, index=False)

print('All Done!!')

