#' stem words
#' 
#' Apply a stemmer to words.  This is a wrapper to \link[SnowballC]{wordStem} 
#' designed to allow this function to be called without loading the entire 
#' \pkg{SnowballC} package.  \link[SnowballC]{wordStem}  uses Martin Porter's 
#' stemming algorithm and the C libstemmer library generated by Snowball.
#' @param x a character vector or corpus, whose words are to be stemmed
#' @param language the name of a recognized language, as returned by 
#'   \link[SnowballC]{getStemLanguages}, or a two- or three-letter ISO-639 code 
#'   corresponding to one of these languages (see references for the list of 
#'   codes)
#' @return A character vector with as many elements as there are in the input 
#'   vector with the corresponding elements being the stem of the word. Elements
#'   of the vector are converted to UTF-8 encoding before the stemming is 
#'   performed, and the returned elements are marked as such when they contain 
#'   non-ASCII characters.
#' @seealso \link[SnowballC]{wordStem}
#'   
#' @references \url{http://snowball.tartarus.org/}
#'   
#'   \url{http://www.iso.org/iso/home/standards/language_codes.htm} for the 
#'   ISO-639 language codes
#' @export
#' @examples
#' # Simple example
#' wordstem(c("win", "winning", "wins", "won", "winner"))
wordstem <- function(x, language = "porter") {
    # S3 generic: the class of `x` selects which wordstem.* method runs;
    # `language` is handed on unchanged to the selected method.
    UseMethod("wordstem", x)
}

#' @rdname wordstem
#' @import stringi 
#' @export
wordstem.character <- function(x, language = "porter") {
    # Character method: `x` must already be split into single tokens, so any
    # element containing a space is rejected before stemming.
    has_space <- stringi::stri_detect_fixed(x, " ")
    # Missing tokens give NA here; na.rm = TRUE stops any() from returning NA,
    # which would make if() fail with "missing value where TRUE/FALSE needed"
    # instead of letting the NA tokens through to the stemmer.
    if (any(has_space, na.rm = TRUE))
        stop("whitespace detected: you can only stem tokenized texts")
    SnowballC::wordStem(x, language)
}

# 
# toks <- unlist(tokenize(toLower(inaugTexts[1:5]), removePunct = TRUE, removeNumbers = TRUE), use.names = FALSE)
# microbenchmark(wordstem(toks), 
#                wordstemP(toks),
#                simplify2array(parallel::mclapply(toks, wordstem, language=language)))

#' @rdname wordstem
#' @import stringi 
#' @export
wordstem.tokenizedTexts <- function(x, language = "porter") {
    # tokenizedTexts method: stems every token of every document in the list
    # `x` and returns a tokenizedTexts list of the same shape.

    # Reject input that still contains spaces. NA tokens produce NA in the
    # detection result, so na.rm = TRUE is needed to keep if() from failing.
    has_space <- unlist(lapply(x, stringi::stri_detect_fixed, " "))
    if (any(has_space, na.rm = TRUE))
        stop("whitespace detected: you can only stem tokenized texts")

    # Compare by value rather than with identical(): an integer 1L (or a
    # missing attribute) must still be treated as plain unigrams, otherwise
    # unigram tokens would be sent through the n-gram splitter.
    ngrams <- attr(x, "ngrams", exact = TRUE)
    if (is.null(ngrams) || isTRUE(all(ngrams == 1))) {
        result <- lapply(x, SnowballC::wordStem, language)
    } else {
        result <- wordstem_Ngrams(x, attr(x, "concatenator", exact = TRUE),
                                  language)
    }
    class(result) <- c("tokenizedTexts", class(result))

    # lapply() keeps only names, so copy the remaining attributes of the input
    # (e.g. ngrams, concatenator) onto the stemmed result.
    for (attr_name in setdiff(names(attributes(x)), c("names", "class")))
        attr(result, attr_name) <- attr(x, attr_name, exact = TRUE)
    result
}


# Stemming for n-grams (internal function).
#
# Splits every n-gram token on `concatenator`, stems each component word with
# SnowballC::wordStem(), and joins the stems again with the same concatenator.
#
# x             character vector of tokens, or a list of such vectors
# concatenator  string joining the words inside an n-gram (matched literally)
# language      stemmer name passed through to SnowballC::wordStem()
#
# Returns a character vector when `x` is a character vector, otherwise a list
# holding one character vector per element of `x`.
wordstem_Ngrams <- function(x, concatenator, language) {
    # Stem one character vector of n-gram tokens; output has the same length.
    stem_tokens <- function(tokens) {
        pieces <- strsplit(tokens, concatenator, fixed = TRUE)
        # vapply() (not sapply()) so a document without tokens still yields
        # character(0) instead of an empty list.
        vapply(pieces, function(words) {
            # paste() would turn a missing token into the string "NA"
            if (any(is.na(words))) return(NA_character_)
            paste(SnowballC::wordStem(words, language = language),
                  collapse = concatenator)
        }, character(1))
    }
    if (is.list(x)) lapply(x, stem_tokens) else stem_tokens(x)
}


#' @rdname wordstem
#' @import stringi 
#' @export
wordstem.dfm <- function(x, language = "porter") {
    # dfm method: stems the feature labels of `x` and rebuilds a sparse
    # document-feature matrix whose columns are the unique stems.

    # Triplet representation gives one (row, column, value) entry per stored
    # cell; take all three from the same object so they are guaranteed to be
    # aligned. Indices in the slots are zero-based, hence the + 1.
    triplet <- as(x, "TsparseMatrix")
    row_index <- triplet@i + 1
    old_features <- features(x)[triplet@j + 1]

    # Compare by value: identical(x@ngrams, 1) is FALSE when the slot holds
    # the integer 1L, which would route unigram features to the n-gram path.
    if (isTRUE(all(x@ngrams == 1)))
        stemmed <- wordstem(old_features, language)
    else
        stemmed <- wordstem_Ngrams(old_features, x@concatenator, language)

    new_features <- unique(stemmed)
    col_index <- match(stemmed, new_features)

    # Features that collapse onto the same stem produce repeated (i, j) pairs,
    # which sparseMatrix() sums. `dims` is given explicitly because otherwise
    # the row count is inferred from max(i) and trailing documents without any
    # counts would make it disagree with the document names.
    result <- sparseMatrix(i = row_index,
                           j = col_index,
                           x = triplet@x,
                           dims = c(nrow(x), length(new_features)),
                           dimnames = list(docs = docnames(x),
                                           features = new_features))
    # NOTE(review): slots of `x` such as ngrams/concatenator are not copied to
    # the new object here - confirm whether they should be.
    new("dfmSparse", result)
}


# wordstem2 <- function(x, language = "porter") {
#     
#     dt <- data.table(stemmedFeatures = wordstem(rep(features(x), each = ndoc(x)), language),
#                      docIndex = rep(1:ndoc(x), nfeature(x)),
#                      counts = as.vector(x))
#     setkey(dt, docIndex, stemmedFeatures)
#     dt <- dt[, list(newCounts = sum(counts)), by = list(docIndex, stemmedFeatures)]
#     
#     newFeatures <- unique(dt, by = "stemmedFeatures")[, stemmedFeatures]
#     
#     result <- sparseMatrix(i = dt$docIndex,
#                            j = rep(1:length(newFeatures), each = ndoc(x)),
#                            x = dt$newCounts, 
#                            dimnames=list(docs = docnames(x), 
#                                          features = newFeatures))
#     new("dfmSparse", result)
# }
# 


# testText <- c("Dog runs, dogs run, and doggy is running.", 
#               "The running man likes to run.")
# xtoks <- tokenize(toLower(testText), removePunct = TRUE)
# (xdfm <- dfmNew(xtoks))
# wordstem(x)
# microbenchmark::microbenchmark(dfmNew(wordstem(xtoks), toLower = FALSE, verbose = FALSE), 
#                                wordstem(xdfm), 
#                                wordstem2(xdfm))


# FOR LEMMATIZATION:
# (see http://stackoverflow.com/questions/22993796/lemmatizer-in-r-or-python-am-are-is-be/22994954#22994954)
#     
#     lemmatize <- function(wordlist) {
#         get.lemma <- function(word, url) {
#             response <- GET(url,query=list(spelling=word,standardize="",
#                                            wordClass="",wordClass2="",
#                                            corpusConfig="ncf",    # Nineteenth Century Fiction
#                                            media="xml"))
#             content <- content(response,type="text")
#             xml     <- xmlInternalTreeParse(content)
#             return(xmlValue(xml["//lemma"][[1]]))    
#         }
#         require(httr)
#         require(XML)
#         url <- "http://devadorner.northwestern.edu/maserver/lemmatizer"
#         return(sapply(wordlist,get.lemma,url=url))
#     }
# 
# words <- c("is","am","was","are")
# lemmatize(words)
# #   is   am  was  are 
# # "be" "be" "be" "be" 
