#
# Common errors
# -------------
# 400 Bad Request
# 403 Forbidden (e.g. Nature website)
# 404 Not Found
# 501 Not Implemented
# 999 LinkedIn being defensive
#
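
# Checks the URLs found in the README of briatte/awesome-network-analysis and
# logs those that fail or return a non-200 status code to check.log.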

library(httr)
library(stringr)
library(magrittr) # provides the %>% pipe used below

f <- "check.log"
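
# if check.log does not exist, extract and check every URL in the README;
# otherwise, re-check only the URLs already recorded in the log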
if (!file.exists(f)) {

  u <- str_c(
    "https://raw.githubusercontent.com/",
    "briatte/awesome-network-analysis/",
    "master/README.md"
  )

  cat("Source:", u, "\n")
  u <- GET(u) %>%
    content("text") %>%
    str_split("\\n") %>% # so as to find [foo]: bar links
    unlist

  # total number of links (made to match web.archive.org links only once)
  t <- sum(str_count(u, "(?<!/)http"))

  cat(t, "URLs, ")
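
  # extract the target URLs of both Markdown link styles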
  l <- c(
    # [foo](bar)
    str_extract_all(u, "\\(http(.*?)\\)") %>%
      lapply(str_replace_all, "^\\(|\\)$", "") %>%
      unlist,
    # [foo]: bar
    str_extract_all(u, "^\\[(.*)\\]: (.*)") %>%
      unlist %>%
      str_replace("^\\[(.*)\\]: (.*)", "\\2")
  )
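
  # sanity check: the extracted links must account for every URL counted above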
  stopifnot(length(l) == t)

} else {
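
  # re-check the URLs recorded in the existing log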
  cat("Source:", f, "\n")

  l <- str_subset(stringi::stri_read_lines(f), "^http")

  cat(length(l), "URLs, ")

}
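
# de-duplicate and strip stray whitespace before checking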
l <- str_squish(sort(unique(l)))

cat(length(l), "unique\n")

cat("Ignoring", sum(str_detect(l, "^https://doi.org/")), "DOIs\n")
l <- str_subset(l, "^https://doi.org/", negate = TRUE)
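
# start a fresh log with a timestamped header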
sink(f, append = FALSE)
cat(as.character(Sys.time()), ": checking", length(l), "URLs\n\n")
sink()
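
# check every URL, printing one character per URL as progress:
# "." = HTTP 200, "X" = any other status code, "?" = request failed;
# problematic URLs are appended to the log as they are found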
for (i in l) {
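
  # try() keeps a failed request from stopping the loop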
  x <- try(status_code(GET(i)), silent = TRUE)

  if (!"try-error" %in% class(x) && x != 200) {

    cat("X")

    sink(f, append = TRUE)
    cat(i, "\nStatus code:", x, "\n\n")
    sink()

  } else if ("try-error" %in% class(x)) {

    cat("?")

    sink(f, append = TRUE)
    cat(i, "\nFailed to access\n\n")
    sink()

  } else {

    cat(".")

  }
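
  # every 50 URLs, report how many are left to check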
  if (!which(l == i) %% 50) {
    cat("", length(l) - which(l == i), "left\n")
  }

}

sink(f, append = TRUE)
cat(as.character(Sys.time()), ": done.\n")
sink()
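
# lines of the log that start with a URL give the number of problematic links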
cat("\n", sum(str_count(stringi::stri_read_lines(f), "^http")), "problems\n")