## Basic utilities for vector and string conversion
## ------------------------------------------------------------------------------------------

#' Split a `","`-joined string back to a vector (generic `ggDNAvis` helper)
#'
#' Takes a string (character) produced by [vector_to_string()] and recreates the vector.\cr\cr
#' Note that if a vector of multiple strings is input (e.g. `c("1,2,3", "9,8,7")`) the output
#' will be a single concatenated vector (e.g. `c(1, 2, 3, 9, 8, 7)`).\cr\cr
#' If the desired output is a list of vectors, try [lapply()] e.g.
#' `lapply(c("1,2,3", "9,8,7"), string_to_vector)` returns `list(c(1, 2, 3), c(9, 8, 7))`.
#'
#' @param string `character`. A comma-separated string (e.g. `"1,2,3"`) to convert back to a vector.
#' @param type `character`. The type of the vector to be returned i.e. `"numeric"` (default), `"character"`, or `"logical"`.
#' @param sep `character`. The character used to separate values in the string. Defaults to `","`. *Do not set to anything that might occur within one of the values*.
#' @return `<type> vector`. The resulting vector (e.g. `c(1, 2, 3)`).
#'
#' @examples
#' ## String to numeric vector (default)
#' string_to_vector("1,2,3,4")
#' string_to_vector("1,2,3,4", type = "numeric")
#' string_to_vector("1;2;3;4", sep = ";")
#'
#' ## String to character vector
#' string_to_vector("A,B,C,D", type = "character")
#'
#' ## String to logical vector
#' string_to_vector("TRUE FALSE TRUE", type = "logical", sep = " ")
#'
#' ## By default, vector inputs are concatenated
#' string_to_vector(c("1,2,3", "4,5,6"))
#'
#' ## To create a list of vector outputs, use lapply()
#' lapply(c("1,2,3", "4,5,6"), string_to_vector)
#'
#' @export
string_to_vector <- function(string, type = "numeric", sep = ",") {
    ## Pick the coercion function first so that an unrecognised `type` is
    ## reported before any splitting is attempted.
    ## Type names are matched case-insensitively (e.g. "Numeric" is accepted).
    converter <- switch(
        tolower(type),
        numeric   = as.numeric,
        character = as.character,
        logical   = as.logical,
        NULL
    )
    if (is.null(converter)) {
        abort(paste("Didn't recognise vector type:", type, "\n(currently set up for numeric, character, or logical only)"), class = "argument_value_or_type")
    }

    ## `fixed = TRUE` makes strsplit() treat `sep` as a literal string rather
    ## than a regular expression. Without it, separators that are regex
    ## metacharacters (e.g. "|" or ".") split the input at every character,
    ## so strings produced by vector_to_string(sep = "|") could not be
    ## converted back.
    ## unlist() flattens the per-input-string results into one vector, which
    ## is why multiple input strings are concatenated.
    pieces <- unlist(strsplit(string, split = sep, fixed = TRUE))

    return(converter(pieces))
}

#' Join a vector into a comma-separated string (generic `ggDNAvis` helper)
#'
#' Takes a vector and condenses it into a single string by joining items with `","`.
#' Reversed by [string_to_vector()].
#'
#' @param vector `vector`. A vector (e.g. `c(1,2,3)`) to convert to a string.
#' @param sep `character`. The character used to separate values in the string. Defaults to `","`. *Do not set to anything that might occur within one of the values*.
#' @return `character`. The same vector but as a comma-separated string (e.g. `"1,2,3"`).
#'
#' @examples
#' vector_to_string(c(1, 2, 3, 4))
#' vector_to_string(c("These", "are", "some", "words"))
#' vector_to_string(3:5, sep = ";")
#'
#' @export
vector_to_string <- function(vector, sep = ",") {
    ## Collapse all elements into a single string, placing `sep` between
    ## neighbouring elements. Elements are converted to text by paste().
    joined <- paste(vector, collapse = sep)
    joined
}


#' Print a numeric vector to console (`ggDNAvis` debug helper)
#'
#' Takes a numeric vector, and prints it to the console separated by `", "`.\cr\cr
#' This allows the output to be copy-pasted into a vector within an R script.
#' Used for taking vector outputs and then writing them as literals within a script. \cr\cr
#' E.g. when given input `1:5`, prints `1, 2, 3, 4, 5`, which can be directly copy-pasted
#' within `c()` to input that vector. Printing normally via `print(1:5)` instead prints
#' `[1] 1 2 3 4 5`, which is not valid vector input so can't be copy-pasted directly.\cr\cr
#' See [debug_join_vector_str()] for the equivalent for character/string vectors.
#'
#' @param vector `numeric vector`. Usually generated by some other function. This function allows copy-pasting the output to directly create a vector with this value.
#' @return None (invisible `NULL`) - uses [cat()] to output directly to console.
#'
#' @examples
#' debug_join_vector_num(1:5)
#'
#' @export
debug_join_vector_num <- function(vector) {
    ## Build the `", "`-separated text first, then write it to the console.
    ## cat() is the last call, so its invisible NULL is what gets returned,
    ## and no trailing newline is added.
    literal <- paste(vector, collapse = ", ")
    cat(literal)
}


#' Print a character/string vector to console (`ggDNAvis` debug helper)
#'
#' Takes a character/string vector, and prints it to the console separated by `", "`.\cr\cr
#' This allows the output to be copy-pasted into a vector within an R script.
#' Used for taking vector outputs and then writing them as literals within a script. \cr\cr
#' E.g. when given input `strsplit("ABCD", split = "")[[1]]`, prints `"A", "B", "C", "D"`,
#' which can be directly copy-pasted within `c()` to input that vector.
#' Printing normally via `print(strsplit("ABCD", split = "")[[1]])` instead prints
#' `[1] "A" "B" "C" "D"`, which is not valid vector input so can't be copy-pasted directly.\cr\cr
#' See [debug_join_vector_num()] for the equivalent for numeric vectors.
#'
#' @param vector `character vector`. Usually generated by some other function. This function allows copy-pasting the output to directly create a vector with this value.
#' @return None (invisible `NULL`) - uses [cat()] to output directly to console.
#'
#' @examples
#' debug_join_vector_str(c("A", "B", "C", "D"))
#'
#' @export
debug_join_vector_str <- function(vector) {
    ## Joining with `", "` closes one quoted string and opens the next at every
    ## boundary; the outermost opening/closing quotes are then added around the
    ## whole text before it is written to the console (no trailing newline).
    inner <- paste(vector, collapse = '", "')
    cat(paste0('"', inner, '"'))
}

## ------------------------------------------------------------------------------------------




#' Reverse complement a DNA/RNA sequence (generic `ggDNAvis` helper)
#'
#' This function takes a string/character representing a DNA/RNA sequence and returns
#' the reverse complement. Either DNA (`A/C/G/T`) or RNA (`A/C/G/U`) input is accepted. \cr\cr
#' By default, output is DNA (so `A` is reverse-complemented to `T`), but it can be set
#' to output RNA (so `A` is reverse-complemented to `U`).
#'
#' @param sequence `character`. A DNA/RNA sequence (`A/C/G/T/U`) to be reverse-complemented. No other characters allowed. Only one sequence allowed.
#' @param output_mode `character`. Either `"DNA"` (default) or `"RNA"`, to determine whether `A` should be reverse-complemented to `T` or to `U`.
#' @return `character`. The reverse-complement of the input sequence.
#'
#' @examples
#' reverse_complement("ATGCTAG")
#' reverse_complement("UUAUUAGC", output_mode = "RNA")
#' reverse_complement("AcGtU", output_mode = "DNA")
#' reverse_complement("aCgTU", output_mode = "RNA")
#'
#' @export
reverse_complement <- function(sequence, output_mode = "DNA") {
    ## Reject NULL/NA arguments, naming the offending argument in the message
    ## (pasting the value itself would just print "NA" or nothing).
    arguments <- list(sequence = sequence, output_mode = output_mode)
    for (argument_name in names(arguments)) {
        argument <- arguments[[argument_name]]
        if (is.null(argument) || anyNA(argument)) {
            abort(paste("Argument", argument_name, "must not be NULL or NA"), class = "argument_value_or_type")
        }
    }

    ## Both arguments must be single character values.
    if (length(sequence) != 1) {
        abort("Can only input one sequence at once. Try sapply(input_vector, reverse_complement) to use on more than one input.", class = "argument_length")
    }
    if (length(output_mode) != 1) {
        abort("Output mode must be a single value (either 'DNA' or 'RNA')", class = "argument_length")
    }
    if (!is.character(sequence) || !is.character(output_mode)) {
        abort("Sequence and output mode must both be character/string values.", class = "argument_value_or_type")
    }

    ## Validate the output mode up front (case-insensitively). Checking it only
    ## when an "A" is encountered would let an invalid mode pass silently for
    ## sequences that contain no A.
    mode <- toupper(output_mode)
    if (!mode %in% c("DNA", "RNA")) {
        abort("Output mode must be set to either 'DNA' (default) or 'RNA'", class = "argument_value_or_type")
    }

    ## An empty sequence is its own reverse complement.
    if (nchar(sequence) == 0) {
        return("")
    }

    ## Input is case-insensitive; output is always upper case.
    bases <- strsplit(toupper(sequence), split = "")[[1]]
    if (!all(bases %in% c("A", "C", "G", "T", "U"))) {
        abort("Cannot reverse sequence for non-A/C/G/T/U", class = "argument_value_or_type")
    }

    ## chartr() maps each character independently:
    ## A -> T (DNA) or U (RNA), C -> G, G -> C, and both T and U -> A.
    complement_of_a <- if (mode == "DNA") "T" else "U"
    complemented <- chartr("ACGTU", paste0(complement_of_a, "GCAA"), bases)

    new_sequence <- paste(rev(complemented), collapse = "")
    return(new_sequence)
}



## These next two functions work together to encode
## sequence numerically for visualisation via `raster::raster()`.
## A = 1, C = 2, G = 3, T/U = 4, blank = 0

#' Map a single base to the corresponding number (generic `ggDNAvis` helper)
#'
#' This function takes a single base and numerically
#' encodes it for visualisation via [raster::raster()]. \cr\cr
#' Encoding: `A = 1`, `C = 2`, `G = 3`, `T/U = 4`.
#'
#' @param base `character`. A single DNA/RNA base to encode numerically (e.g. `"A"`).
#' @return `integer`. The corresponding number.
#'
#' @examples
#' convert_base_to_number("A")
#' convert_base_to_number("c")
#' convert_base_to_number("g")
#' convert_base_to_number("T")
#' convert_base_to_number("u")
#'
#' @export
convert_base_to_number <- function(base) {
    ## Exactly one base is handled per call.
    if (length(base) != 1) {
        abort("Can only input one base at once", class = "argument_length")
    }

    ## Matching is case-insensitive: compare against the upper-cased symbol.
    symbol <- toupper(base)

    ## Encoding: A = 1, C = 2, G = 3, T/U = 4. Return as soon as a match is found.
    if (symbol == "A") {
        return(1)
    }
    if (symbol == "C") {
        return(2)
    }
    if (symbol == "G") {
        return(3)
    }
    if (symbol %in% c("T", "U")) {
        return(4)
    }

    ## Anything else is not a recognised base.
    abort("Base must be one of A/C/G/T/U to convert to number", class = "argument_value_or_type")
}

#' Map a sequence to a vector of numbers (generic `ggDNAvis` helper)
#'
#' This function takes a sequence and encodes it as a vector
#' of numbers for visualisation via [raster::raster()]. \cr\cr
#' Encoding: `A = 1`, `C = 2`, `G = 3`, `T/U = 4`.
#'
#' @param sequence `character`. A DNA/RNA sequence (`A/C/G/T/U`) to be encoded numerically. No other characters allowed. Only one sequence allowed.
#' @param length `integer`. How long the output numerical vector should be. If shorter than the sequence, the vector will include the first *n* bases up to this length. If longer than the sequence, the vector will be padded with 0s at the end. If left blank/set to `NA` (default), will output a vector the same length as the input sequence.
#' @return `integer vector`. The numerical encoding of the input sequence, cut/padded to the desired length.
#'
#' @examples
#' convert_sequence_to_numbers("ATCGATCG")
#' convert_sequence_to_numbers("ATCGATCG", length = NA)
#' convert_sequence_to_numbers("ATCGATCG", length = 4)
#' convert_sequence_to_numbers("ATCGATCG", length = 10)
#'
#' @export
convert_sequence_to_numbers <- function(sequence, length = NA) {
    ## Both arguments must be single values.
    ## (The `length` parameter does not interfere with calls to length():
    ## R skips non-function bindings when looking up a function name.)
    if (length(sequence) != 1) {
        abort("Sequence must be a single character/string value", class = "argument_length")
    }
    if (length(length) != 1) {
        abort("Length must be a single integer (or NA) value", class = "argument_length")
    }

    ## Computed once rather than on every loop iteration.
    sequence_length <- nchar(sequence)

    ## NA (the default) means "same length as the input sequence".
    if (is.na(length)) {
        length <- sequence_length
    }

    ## Non-finite values (e.g. Inf) are rejected explicitly: `Inf %% 1` is NaN,
    ## which would otherwise reach if() as NA and fail with an unrelated
    ## "missing value where TRUE/FALSE needed" error.
    if (!is.numeric(length) || !is.finite(length) || length %% 1 != 0 || length < 0) {
        abort("Length must be a non-negative integer or NA", class = "argument_value_or_type")
    }

    ## Deliberately a separate `if`: this also covers the case where length was
    ## not specified and the sequence itself is empty.
    if (length == 0) {
        return(numeric(0))
    }

    ## Preallocate the output filled with 0, so positions beyond the end of the
    ## sequence are already padded. Only the first `length` bases are encoded
    ## when the sequence is longer than the requested length.
    numerical_vector <- numeric(length)
    for (i in seq_len(min(sequence_length, length))) {
        numerical_vector[i] <- convert_base_to_number(substr(sequence, i, i))
    }

    return(numerical_vector)
}



#' Rasterise a vector of sequences into a numerical dataframe for ggplotting (generic `ggDNAvis` helper)
#'
#' Takes a character vector of sequences (which are allowed to be empty `""` to
#' act as a spacing line) and rasterises it into a dataframe that ggplot can read.
#'
#' @param sequences `character vector`. A vector of sequences for plotting, e.g. `c("ATCG", "", "GGCGGC", "")`. Each sequence will be plotted left-aligned on a new line.
#' @return `dataframe`. Rasterised dataframe representation of the sequences, readable by [ggplot2::ggplot()].
#'
#' @examples
#' create_image_data(c("ATCG", "", "GGCGGC", ""))
#'
#' @export
create_image_data <- function(sequences) {
    if (!is.character(sequences)) {
        abort("Must input a character vector of sequences", class = "argument_value_or_type")
    }
    ## A zero-length vector has no longest sequence: max() would warn and
    ## return -Inf, and matrix() would then fail with an unrelated error.
    if (length(sequences) == 0) {
        abort("Must input at least one sequence", class = "argument_length")
    }

    ## One matrix row per sequence; the column count is set by the longest
    ## sequence. Shorter (or empty) sequences are padded with 0 by
    ## convert_sequence_to_numbers().
    max_length <- max(nchar(sequences))
    image_matrix <- matrix(NA, nrow = length(sequences), ncol = max_length)
    for (i in seq_along(sequences)) {
        image_matrix[i, ] <- convert_sequence_to_numbers(sequences[i], max_length)
    }

    ## Convert the matrix to a raster and then to a dataframe; `xy = TRUE` is
    ## passed to raster::as.data.frame() to request coordinate columns.
    image_data <- raster::as.data.frame(raster::raster(image_matrix), xy = TRUE)
    return(image_data)
}
