使用 JavaScript 编写的简单爬虫,用于爬取牛客网前端题目。
在牛客网页面打开控制台,粘贴代码并运行即可。先把返回的 HTML 解析成 DOM 树,便于用选择器进行精确查找,避免编写过长的正则表达式进行全文匹配(可能存在漏处理的情况)。
关键代码:
// Parse the fetched page markup (`result`, obtained outside this excerpt) into a
// Document so it can be queried with CSS selectors instead of regular expressions.
const parser=new DOMParser()
const html = parser.parseFromString(result, "text/html")
// Look up the list element nested inside the `.final-pagination` container.
let ul = html.querySelector('.final-pagination ul')
完整代码:
/**
 * Crawls every page of `/ta/review-frontend/review` and collects the question
 * and answer markup found on each page.
 *
 * Relies on the host page's global jQuery (`$`) and on `DOMParser`, and requests
 * a relative URL, so it is meant to be run from the browser console of the site
 * being crawled. Pages are fetched sequentially, one request at a time.
 *
 * @returns {Promise<Array<{id: number, question: string, answer: string}>>}
 *   One entry per page: `id` is the 1-based page number, `question` is the
 *   outerHTML of `.final-question` and `answer` the outerHTML of
 *   `.design-answer-box`.
 * @throws {Error} If a request fails, if a fetched page lacks one of the
 *   expected elements, or if the page count cannot be read from the pagination.
 */
async function getNowCodeQuestions() {
  // Adapts jQuery's callback-style $.get to a promise. The failure handler is
  // essential: without it a failed request would leave the await pending forever.
  const fetchPage = (page) =>
    new Promise((resolve, reject) => {
      $.get(`/ta/review-frontend/review?page=${page}`, (result) => {
        resolve(result);
      }).fail((jqXHR, textStatus, errorThrown) => {
        const reason = errorThrown || textStatus || 'unknown error';
        reject(new Error(`Failed to fetch page ${page}: ${reason}`));
      });
    });

  // Looks up a mandatory element and fails with a message naming the page and
  // selector, instead of a bare "cannot read properties of null" TypeError.
  const requireElement = (doc, selector, page) => {
    const element = doc.querySelector(selector);
    if (element === null) {
      throw new Error(`Page ${page}: no element matches "${selector}"`);
    }
    return element;
  };

  const parser = new DOMParser();
  const questions = [];

  // The real page count is only known once the first page has been parsed;
  // starting at 1 guarantees that the first iteration runs.
  let pageCount = 1;
  for (let curPage = 1; curPage <= pageCount; curPage += 1) {
    const result = await fetchPage(curPage);
    const html = parser.parseFromString(result, 'text/html');

    const ul = requireElement(html, '.final-pagination ul', curPage);
    // The second-to-last pagination item is read as the total page count
    // (presumably the last item is a "next page" control -- confirm against the site).
    const lastPageItem = ul.children[ul.childElementCount - 2];
    const parsedCount = lastPageItem === undefined ? NaN : Number(lastPageItem.innerText);
    if (!Number.isInteger(parsedCount) || parsedCount < 1) {
      // A NaN count would otherwise end the loop silently with partial results.
      throw new Error(`Page ${curPage}: could not read the page count from the pagination`);
    }
    pageCount = parsedCount;

    const question = requireElement(html, '.final-question', curPage).outerHTML;
    const answer = requireElement(html, '.design-answer-box', curPage).outerHTML;
    questions.push({
      id: curPage,
      question,
      answer,
    });
    console.log('爬取进度:' + curPage + '/' + pageCount);
  }
  return questions;
}
// Run the crawl and print all collected questions and answers as a single
// HTML string: a title line followed by one question/answer pair per page.
getNowCodeQuestions()
  .then((questions) => {
    const header = '<div class="title">牛客网前端题目</div>\n';
    const body = questions
      .map(({ id, question, answer }) => `题目${id}:${question}\n答案:${answer}\n`)
      .join('');
    console.log(header + body);
  })
  .catch((error) => {
    // Report a failed crawl explicitly instead of leaving the rejection unhandled.
    console.error('Failed to crawl questions:', error);
  });