# Mirror of https://github.com/briatte/awesome-network-analysis.git
# Synced 2024-10-01 01:06:13 -04:00
# 102 lines, 1.8 KiB, R
library(httr)
library(stringr)

# Path of the log file. When this file already exists, the script re-checks
# the URLs listed in it instead of downloading the README again; the file is
# then overwritten with the results of the current run.
f <- "check.log"

# User-agent string sent with every link-check request.
a <- "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36"

# Build `l`, the character vector of URLs to check: either every link found in
# the project README (no log file yet) or the URLs listed in the existing log.
if (!file.exists(f)) {

  u <- str_c(
    "https://raw.githubusercontent.com/",
    "briatte/awesome-network-analysis/",
    "master/README.md"
  )

  cat("Source:", u, "\n")

  # Abort on an HTTP error instead of silently parsing an error page (which
  # would yield zero links and a falsely clean run), and state the encoding
  # explicitly rather than letting httr fall back to a default.
  readme <- GET(u) %>%
    stop_for_status() %>%
    content("text", encoding = "UTF-8") %>%
    str_split("\\n") %>% # one element per line, so as to find [foo]: bar links
    unlist()

  # total number of links
  n_links <- sum(str_count(readme, "http"))

  cat(n_links, "URLs, ")

  l <- c(
    # [foo](bar)
    str_extract_all(readme, "\\(http(.*?)\\)") %>%
      unlist() %>%
      str_replace_all("^\\(|\\)$", ""),
    # [foo]: bar
    str_extract_all(readme, "^\\[(.*)\\]: (.*)") %>%
      unlist() %>%
      str_replace("^\\[(.*)\\]: (.*)", "\\2")
  )

  # Sanity check: every "http" occurrence must have been extracted as a link.
  stopifnot(length(l) == n_links)

} else {

  cat("Source:", f, "\n")

  # Logged URLs are followed by a space (cat() inserts its separator before
  # the newline when the log is written); trim it so each URL is requested
  # exactly as it was originally listed.
  l <- readLines(f) %>%
    str_subset("^http") %>%
    str_trim()

  cat(length(l), "URLs, ")

}

# Drop duplicate URLs and sort the remainder, then report how many are left.
l <- sort(unique(l))

cat(length(l), "unique\n")

# Start a fresh log (overwriting any previous one) with a timestamped header.
# Writing through cat(file = ) rather than a sink()/sink() pair means console
# output can never be left diverted to the log if something fails in between.
cat(
  as.character(Sys.time()), ": checking", length(l), "URLs\n\n",
  file = f, append = FALSE
)

# Request every URL in turn. Progress is printed to the console as one
# character per URL ("." = status 200, "X" = any other status, "?" = request
# failed); problems are appended to the log file.
for (k in seq_along(l)) {

  url <- l[k]

  # try() turns connection-level failures into a "try-error" object so that
  # one unreachable host does not stop the whole run.
  x <- try(status_code(GET(url, user_agent(a))), silent = TRUE)

  if (inherits(x, "try-error")) {

    cat("?")

    cat(url, "\nFailed to access\n\n", file = f, append = TRUE)

    # pause before moving on to the next URL after a failed request
    Sys.sleep(5)

  } else if (x != 200) {

    cat("X")

    cat(url, "\nStatus code:", x, "\n\n", file = f, append = TRUE)

  } else {

    cat(".")

  }

  # every 50 URLs, report how many are still to be checked
  if (k %% 50 == 0) {
    cat("", length(l) - k, "left\n")
  }

}

# Close the log with a timestamped footer.
cat(as.character(Sys.time()), ": done.\n", file = f, append = TRUE)

# Each logged problem starts with its URL on a line of its own, so the number
# of log lines beginning with "http" is the number of problems found.
cat("Found", sum(str_detect(readLines(f), "^http")), "problems.\n")