-
Johannes Boog authoredJohannes Boog authored
Code owners
Assign users and groups as approvers for specific file changes. Learn more.
analyse_xml.R 8.66 KiB
#===== analyse_xml =====
#' analyse_xml
#' @description
#' This is a helper function which can be used to combat missing
#' documentation. It looks for files in a path which match the given pattern
#' and then attempts to parse them as XML. For each occurence of the specified
#' element, it documents its attributes and direct children and prints a little
#' summary of its findings at the end.
#' @param path string: A path
#' @param pattern string: A regex pattern
#' @param xpath string: An XPath expression. WARNING: Only works for expressions
#' that return nodesets, use it to look up tag names.
#' @param print_findings Optional: Should the results be printed to the console?
#' @noRd
analyse_xml <- function(path,
pattern,
xpath,
print_findings = TRUE) {
path <- as_dir_path(path)
assertthat::assert_that(assertthat::is.string(pattern))
assertthat::assert_that(assertthat::is.string(xpath))
xml_files <- list.files(path = path, pattern = pattern, recursive = TRUE)
files_found <- length(xml_files)
valid_files_count <- 0
valid_files_names <- character()
element_found_files_names <- character()
invalid_files_count <- 0
invalid_files_names <- character()
total_matches <- 0
attr_ex_counts <- list()
child_ex_counts <- list()
child_tot_counts <- list()
#Return values, will contain a named list of flags
attr_flags <- list()
child_flags <- list()
if(length(xml_files) == 0) {
#stop(paste("No files found for pattern ", pattern), call. = FALSE)
warning("No files found for pattern ", pattern)
return(invisible(NULL))
}
for(i in seq_len(length(xml_files))){
skip_to_next <- FALSE
out<- tryCatch(
{
xml_doc <- xml2::read_xml(paste0(path, xml_files[[i]]),
encoding="ISO-8859-1")
},
error = function(cond){
invalid_files_count <<- invalid_files_count + 1
invalid_files_names <<- c(invalid_files_names,
basename(xml_files[[i]]))
skip_to_next <<- TRUE
}
)
if(skip_to_next){
next
}
valid_files_count <- valid_files_count + 1
valid_files_names <- c(valid_files_names, basename(xml_files[[i]]))
doc_matches <- xml2::xml_find_all(xml_doc, xpath)
total_matches <- total_matches + length(doc_matches)
if(length(doc_matches) > 0){
element_found_files_names <- c(element_found_files_names,
xml_files[[i]])
}
#Get attribute names and counts
for (j in seq_len(length(doc_matches))){
attr_names <- names(xml2::xml_attrs(doc_matches[[j]]))
for(k in seq_len(length(attr_names))) {
if(!attr_names[[k]] %in% names(attr_ex_counts)) {
attr_ex_counts[[attr_names[[k]]]] <- 1
}else{
attr_ex_counts[[attr_names[[k]]]] <-
attr_ex_counts[[attr_names[[k]]]] + 1
}
}
}
#Get child names and counts
for (j in seq_len(length(doc_matches))) {
children <- xml2::xml_children(doc_matches[[j]])
first_found <- list()
for (k in seq_len(length(children))) {
child_name <- xml2::xml_name(children[[k]])
if (!child_name %in% first_found) {
first_found <- c(first_found, child_name)
if (!child_name %in% names(child_ex_counts)) {
child_ex_counts[[child_name]] <- 1
} else{
child_ex_counts[[child_name]] <-
child_ex_counts[[child_name]] + 1
}
}
if (!child_name %in% names(child_tot_counts)) {
child_tot_counts[[child_name]] <- 1
} else{
child_tot_counts[[child_name]] <-
child_tot_counts[[child_name]] + 1
}
}
}
}
make_df_vector <- function(x){
round((x / total_matches), 4)
}
#Turn attribute data into a nice data frame
attr_name_count_vector <- unlist(attr_ex_counts, use.names=FALSE)
attr_name_occ_vector <-
unlist(lapply (attr_ex_counts, make_df_vector))
attr_df <- data.frame(name = names(attr_ex_counts),
occ = attr_name_count_vector,
p_occ = attr_name_occ_vector)
if(length(attr_name_count_vector) > 0){
attr_df <- attr_df[order(attr_df$p_occ, decreasing = TRUE),]
rownames(attr_df) <- seq(1, length(attr_name_count_vector))
}
attr_flags <- get_required(attr_df$name, attr_df$p_occ)
#Turn children data into a nice data frame
child_ex_count_vector <- unlist(child_ex_counts, use.names=FALSE)
child_ex_occ_vector <-
unlist(lapply(child_ex_counts, make_df_vector))
child_tot_count_vector <- unlist(child_tot_counts, use.names=FALSE)
child_tot_occ_vector <-
unlist(lapply(child_tot_counts, make_df_vector))
child_df <- data.frame(name = names(child_ex_counts),
ex_occ = child_ex_count_vector,
p_occ = child_ex_occ_vector,
total = child_tot_count_vector,
total_mean = child_tot_occ_vector)
if(length(child_ex_count_vector) > 0){
child_df <- child_df[order(child_df$p_occ, decreasing = TRUE),]
rownames(child_df) <- seq(1, length(child_ex_count_vector))
}
child_flags <- get_required(child_df$name, child_df$p_occ)
if(print_findings) {
print_analysis_findings(
invalid_files_count,
invalid_files_names,
valid_files_count,
xpath,
element_found_files_names,
total_matches,
attr_ex_counts,
attr_df,
child_ex_counts,
child_df)
}
both_flags <- sort(c(attr_flags, child_flags), decreasing = TRUE)
#Return attributes and children (if found)
return(invisible(
list(
xpath = xpath,
children = child_flags,
attributes = attr_flags,
both_sorted = both_flags
)
))
}
#===== print_analysis_findings =====
print_analysis_findings <- function(invalid_files_count,
invalid_files_names,
valid_files_count,
xpath,
element_found_files_names,
total_matches,
attr_ex_counts,
attr_df,
child_ex_counts,
child_df) {
if(invalid_files_count > 0) {
cat("I skipped ", invalid_files_count,
" malformed XML files matching your pattern:\n", sep = "")
for(i in seq_len(length(invalid_files_names))){
cat(invalid_files_names[[i]], "\n")
}
}
cat("\nI parsed ", valid_files_count,
" valid XML files matching your pattern.\n", sep = "")
tag_name <- get_tag_from_xpath(xpath)
if(length(element_found_files_names) > 0){
cat("\nI found at least one element named ",
tag_name, " in the following file(s):\n", sep = "")
for(i in seq_len(length(element_found_files_names))){
cat(element_found_files_names[[i]], "\n")
}
}
cat("\nIn total, I found ", total_matches,
" element(s) named ", tag_name, ".\n", sep = "")
if(length(attr_ex_counts) > 0) {
cat("\nThese are the attributes I found:\n")
print(attr_df)
}
if(length(child_ex_counts) > 0){
cat("\nThese are the child elements I found:\n")
print(child_df)
}
return(invisible())
}
#===== get_required =====
#' get_required
#' @description
#' Helper function to mark required attributes or children from a
#' vector of names and a vector of occurrence probabilities
#' @param names A vector of names
#' @param occurence_probabilities A vector of occurrence probabilities
#' @noRd
get_required <- function(names, occurence_probabilities){
required <- logical()
for(i in seq_len(length(names))) {
if(occurence_probabilities[[i]] < 1) {
required[[paste0(names[[i]])]] <- FALSE
}else{
required[[paste0(names[[i]])]] <- TRUE
}
}
return(required)
}