# Common errors
# -------------
# 400 Bad Request
# 403 Forbidden (e.g. Nature website)
# 404 Not Found
# 501 Not Implemented
# 999 LinkedIn being defensive

library(httr)
library(magrittr) # for the %>% pipe
library(stringr)

f <- "check.log"

if (!file.exists(f)) {

  u <- str_c(
    "https://raw.githubusercontent.com/",
    "briatte/awesome-network-analysis/",
    "master/README.md"
  )

  cat("Source:", u, "\n")

  u <- GET(u) %>%
    content("text") %>%
    str_split("\\n") %>% # so as to find [foo]: bar links
    unlist()

  # remove links that have been commented out
  u <- str_remove_all(u, "<!--.*?-->")

  # total number of links (made to match web.archive.org links only once)
  t <- sum(str_count(u, "(?<!/)http"))
  cat(t, "URLs, ")

  l <- c(
    # [foo](bar)
    str_extract_all(u, "\\(http.*?\\)") %>%
      lapply(str_replace_all, "^\\(|\\)$", "") %>%
      unlist(),
    # [foo]: bar
    str_extract_all(u, "^\\[(.*)\\]: (.*)") %>%
      unlist() %>%
      str_replace("^\\[(.*)\\]: (.*)", "\\2")
  )

  stopifnot(length(l) == t)

} else {

  cat("Source:", f, "\n")
  l <- str_subset(stringi::stri_read_lines(f), "^http")
  cat(length(l), "URLs, ")

}

l <- str_squish(sort(unique(l)))
cat(length(l), "unique\n")

cat("Ignoring", sum(str_detect(l, "^https://doi.org/")), "DOIs\n")
l <- str_subset(l, "^https://doi.org/", negate = TRUE)

sink(f, append = FALSE)
cat(as.character(Sys.time()), ": checking", length(l), "URLs\n\n")
sink()

for (i in l) {

  x <- try(status_code(GET(i)), silent = TRUE)

  if (inherits(x, "try-error")) {

    cat("?")

    sink(f, append = TRUE)
    cat(i, "\nFailed to access\n\n")
    sink()

  } else if (x != 200) {

    cat("X")

    sink(f, append = TRUE)
    cat(i, "\nStatus code:", x, "\n\n")
    sink()

  } else {

    cat(".")

  }

  # progress indicator every 50 URLs
  if (!which(l == i) %% 50) {
    cat("", length(l) - which(l == i), "left\n")
  }

}

# count the errors logged so far, then append the closing timestamp
n <- sum(str_count(stringi::stri_read_lines(f), "^http"))

sink(f, append = TRUE)
cat(as.character(Sys.time()), ": done,", n, "errors.\n")
sink()

cat("\n", n, "errors\n")
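
# Usage (sketch)
# --------------
# The script name below is an assumption, not set anywhere in this file;
# adjust it to match your checkout:
#
#   Rscript check-urls.r
#
# First run (no check.log yet): fetch the README from GitHub, extract every
# link, check each one, and log failures to check.log. Subsequent runs
# (check.log exists): re-check only the URLs already listed in check.log,
# which makes iterating on fixes faster. Delete check.log to start over.
#
# The log looks roughly like this (timestamp and URL are illustrative):
#
#   2024-01-01 12:00:00 : checking 500 URLs
#
#   https://example.org/dead-link
#   Status code: 404
#
#   2024-01-01 12:10:00 : done, 1 errors.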