Skip to content
Snippets Groups Projects
Code owners
Assign users and groups as approvers for specific file changes. Learn more.
analyse_xml.R 8.57 KiB

#===== analyse_xml =====


#' analyse_xml
#' @description
#' This is a helper function which can be used to combat missing
#' documentation. It looks for files in a path which match the given pattern
#' and then attempts to parse them as XML. For each occurence of the specified
#' element, it documents its attributes and direct children and prints a little
#' summary of its findings at the end.
#' @param path string: A path
#' @param pattern string: A regex pattern
#' @param xpath string: An XPath expression. WARNING: Only works for expressions
#'   that return nodesets, use it to look up tag names.
#' @param print_findings Optional: Should the results be printed to the console?
#' @noRd
analyse_xml <- function(path,
                        pattern,
                        xpath,
                        print_findings = TRUE) {

    path <- as_dir_path(path)
    assertthat::assert_that(assertthat::is.string(pattern))
    assertthat::assert_that(assertthat::is.string(xpath))

    xml_files <- list.files(path = path, pattern = pattern, recursive = TRUE)

    files_found <- length(xml_files)

    valid_files_count <- 0
    valid_files_names <- character()

    element_found_files_names <- character()

    invalid_files_count <- 0
    invalid_files_names <- character()

    total_matches <- 0

    attr_ex_counts <- list()
    child_ex_counts <- list()
    child_tot_counts <- list()

    #Return values, will contain a named list of flags
    attr_flags <- list()
    child_flags <- list()

    if(length(xml_files) == 0) {
        stop(paste("No files found for pattern ", pattern), call. = FALSE)
    }

    for(i in seq_len(length(xml_files))){

        skip_to_next <- FALSE

        out<- tryCatch(
            {
                xml_doc <- xml2::read_xml(paste0(path, xml_files[[i]]),
                                          encoding="ISO-8859-1")
            },

            error = function(cond){
                invalid_files_count <<- invalid_files_count + 1
                invalid_files_names <<- c(invalid_files_names,
                                          basename(xml_files[[i]]))
                skip_to_next <<- TRUE
            }
        )
        if(skip_to_next){
            next
        }

        valid_files_count <- valid_files_count + 1
        valid_files_names <- c(valid_files_names, basename(xml_files[[i]]))

        doc_matches <- xml2::xml_find_all(xml_doc, xpath)
        total_matches <- total_matches + length(doc_matches)

        if(length(doc_matches) > 0){
            element_found_files_names <- c(element_found_files_names,
                                           xml_files[[i]])
        }

        #Get attribute names and counts
        for (j in seq_len(length(doc_matches))){
            attr_names <- names(xml2::xml_attrs(doc_matches[[j]]))

            for(k in seq_len(length(attr_names))) {
                if(!attr_names[[k]] %in% names(attr_ex_counts)) {
                    attr_ex_counts[[attr_names[[k]]]] <- 1
                }else{
                    attr_ex_counts[[attr_names[[k]]]] <-
                        attr_ex_counts[[attr_names[[k]]]] + 1
                }
            }
        }

        #Get child names and counts
        for (j in seq_len(length(doc_matches))) {
            children <- xml2::xml_children(doc_matches[[j]])

            first_found <- list()

            for (k in seq_len(length(children))) {
                child_name <- xml2::xml_name(children[[k]])

                if (!child_name %in% first_found) {
                    first_found <- c(first_found, child_name)

                    if (!child_name %in% names(child_ex_counts)) {
                        child_ex_counts[[child_name]] <- 1
                    } else{
                        child_ex_counts[[child_name]] <-
                            child_ex_counts[[child_name]] + 1
                    }
                }

                if (!child_name %in% names(child_tot_counts)) {
                    child_tot_counts[[child_name]] <- 1
                } else{
                    child_tot_counts[[child_name]] <-
                        child_tot_counts[[child_name]] + 1
                }
            }
        }
    }

    make_df_vector <- function(x){
        round((x / total_matches), 4)
    }

    #Turn attribute data into a nice data frame

    attr_name_count_vector <- unlist(attr_ex_counts, use.names=FALSE)

    attr_name_occ_vector <-
        unlist(lapply (attr_ex_counts, make_df_vector))
    attr_df <- data.frame(name = names(attr_ex_counts),
                          occ = attr_name_count_vector,
                          p_occ = attr_name_occ_vector)

    if(length(attr_name_count_vector) > 0){
        attr_df <- attr_df[order(attr_df$p_occ, decreasing = TRUE),]
        rownames(attr_df) <- seq(1, length(attr_name_count_vector))
    }

    attr_flags <- get_required(attr_df$name, attr_df$p_occ)

    #Turn children data into a nice data frame

    child_ex_count_vector <- unlist(child_ex_counts, use.names=FALSE)

    child_ex_occ_vector <-
        unlist(lapply(child_ex_counts, make_df_vector))

    child_tot_count_vector <- unlist(child_tot_counts, use.names=FALSE)

    child_tot_occ_vector <-
        unlist(lapply(child_tot_counts, make_df_vector))

    child_df <- data.frame(name = names(child_ex_counts),
                           ex_occ = child_ex_count_vector,
                           p_occ = child_ex_occ_vector,
                           total = child_tot_count_vector,
                           total_mean = child_tot_occ_vector)

    if(length(child_ex_count_vector) > 0){
        child_df <- child_df[order(child_df$p_occ, decreasing = TRUE),]
        rownames(child_df) <- seq(1, length(child_ex_count_vector))
    }

    child_flags <- get_required(child_df$name, child_df$p_occ)

    if(print_findings) {
        print_analysis_findings(
            invalid_files_count,
            invalid_files_names,
            valid_files_count,
            xpath,
            element_found_files_names,
            total_matches,
            attr_ex_counts,
            attr_df,
            child_ex_counts,
            child_df)
    }

    both_flags <- sort(c(attr_flags, child_flags), decreasing = TRUE)

    #Return attributes and children (if found)
    return(invisible(
        list(
            xpath = xpath,
            children = child_flags,
            attributes = attr_flags,
            both_sorted = both_flags
        )
    ))
}


#===== print_analysis_findings =====


print_analysis_findings <- function(invalid_files_count,
                                    invalid_files_names,
                                    valid_files_count,
                                    xpath,
                                    element_found_files_names,
                                    total_matches,
                                    attr_ex_counts,
                                    attr_df,
                                    child_ex_counts,
                                    child_df) {

    if(invalid_files_count > 0) {
        cat("I skipped ", invalid_files_count,
            " malformed XML files matching your pattern:\n", sep = "")
        for(i in seq_len(length(invalid_files_names))){
            cat(invalid_files_names[[i]], "\n")
        }
    }

    cat("\nI parsed ", valid_files_count,
        " valid XML files matching your pattern.\n", sep = "")

    tag_name <- get_tag_from_xpath(xpath)

    if(length(element_found_files_names) > 0){
        cat("\nI found at least one element named ",
            tag_name, " in the following file(s):\n", sep = "")

        for(i in seq_len(length(element_found_files_names))){
            cat(element_found_files_names[[i]], "\n")
        }
    }

    cat("\nIn total, I found ", total_matches,
        " element(s) named ", tag_name, ".\n", sep = "")

    if(length(attr_ex_counts) > 0) {
        cat("\nThese are the attributes I found:\n")
        print(attr_df)
    }

    if(length(child_ex_counts) > 0){
        cat("\nThese are the child elements I found:\n")
        print(child_df)
    }

    return(invisible())
}


#===== get_required =====


#' get_required
#' @description
#' Helper function to mark required attributes or children from a
#' vector of names and a vector of occurrence probabilities
#' @param names A vector of names
#' @param occurence_probabilities A vector of occurrence probabilities
#' @noRd
get_required <- function(names, occurence_probabilities){

    required <- logical()

    for(i in seq_len(length(names))) {
        if(occurence_probabilities[[i]] < 1) {
            required[[paste0(names[[i]])]] <- FALSE
        }else{
            required[[paste0(names[[i]])]] <- TRUE
        }
    }

    return(required)
}