1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33
# Scrape a novel's chapter index page and build `xs_content`, a two-column
# character matrix: column 1 = chapter title, column 2 = absolute chapter URL.
library("stringr")
library("rvest")
library("RCurl")
library("curl")
library("downloader")
library("RSelenium")

# NOTE(review): machine-specific working directory; the output files written
# later in this script are created relative to it.
setwd('H:\\R\\R脚本\\爬虫')

url <- "https://www.-----.com/yuedu/15111/"
htmlpage <- read_html(url)

# Select the chapter anchors once. Titles and links are both derived from
# this single node set, so the two vectors always have the same length and
# order.
xs_nodes <- htmlpage %>%
  html_nodes("div#wrapper div.box_con div#list dl dd a")

xs_chapter <- html_text(xs_nodes)
head(xs_chapter)

# html_attr("href") yields exactly one value per anchor. html_attrs() returns
# every attribute of every node, so unlist() on it produces more entries than
# there are chapters as soon as an anchor carries any extra attribute.
xs_link <- html_attr(xs_nodes, "href")
head(xs_link)
xs_link <- paste0("https://www.-----.com", xs_link)

# Drop the first ten entries -- presumably a "latest chapters" block that
# duplicates links further down the list; TODO confirm against the page.
xs_content <- cbind(xs_chapter[-(1:10)], xs_link[-(1:10)])
# Trial run: fetch the first two chapters listed in `xs_content`, decoding
# each page as GB18030, and save title + text for both to "1.txt".
ms_f1 <- html_text(
  html_nodes(
    read_html(xs_content[1, 2], encoding = "GB18030"),
    "div#content"
  )
)
ms_f2 <- html_text(
  html_nodes(
    read_html(xs_content[2, 2], encoding = "GB18030"),
    "div#content"
  )
)

# Interleave as: title 1, text 1, title 2, text 2.
ms_f <- c(
  xs_content[1, 1], ms_f1,
  xs_content[2, 1], ms_f2
)

# Replace non-breaking spaces (U+00A0) with ordinary spaces before writing.
write(gsub("\u00A0", " ", ms_f), file = "1.txt")
#' Download every chapter listed in `x` and return titles and text interleaved.
#'
#' Each chapter page is read with GB18030 encoding and the text of its
#' `div#content` node(s) is extracted.
#'
#' @param x A two-column character matrix: column 1 holds the chapter title,
#'   column 2 the chapter URL.
#' @return A character vector in which each chapter title is followed by the
#'   text extracted from that chapter's page; `NULL` when `x` has no rows.
xs_down <- function(x) {
  # seq_len() yields an empty sequence for zero rows, whereas 1:length(...)
  # would iterate over c(1, 0) and fail on a non-existent row.
  chapters <- lapply(seq_len(nrow(x)), function(i) {
    body <- read_html(x[i, 2], encoding = "GB18030") %>%
      html_nodes("div#content") %>%
      html_text()
    c(x[i, 1], body)
  })
  # Flatten once at the end instead of growing a vector inside the loop.
  unlist(chapters, use.names = FALSE)
}
# Download all chapters, replace non-breaking spaces (U+00A0) with ordinary
# spaces, and write the result to the novel's text file.
write(
  x = gsub(
    pattern = "\u00A0",
    replacement = " ",
    x = xs_down(xs_content)
  ),
  file = "战略级天使.txt"
)
|