web

从牛客网爬题(数据解析成 DOM 树)

使用js爬取多个页面数据,解析到dom便于查找

Posted by Youga on May 22, 2018

使用 JS 写的简单爬虫,用于爬取牛客网前端题目。在牛客网页面打开控制台,拷贝代码运行即可。将返回的 HTML 解析成 DOM 树,方便进行精确查找,避免编写过长的正则表达式做全文匹配(正则匹配可能存在漏处理的情况)。

关键代码:

// Excerpt of the complete listing: `result` is the response body of a fetched
// page. Parsing it as "text/html" yields a Document that can be queried with
// CSS selectors instead of matching the raw text with regular expressions.
const parser=new DOMParser()
const html = parser.parseFromString(result, "text/html")
// Look up the <ul> inside the element with class "final-pagination".
let ul = html.querySelector('.final-pagination ul')

完整:

/**
 * Crawl the Nowcoder front-end review pages one by one and collect the
 * question and answer markup found on each page.
 *
 * Intended to be pasted into the browser console on the Nowcoder site: it uses
 * the page's global jQuery (`$`) for the requests and the browser's
 * `DOMParser` to turn each response into a queryable document.
 *
 * Pages are requested sequentially starting at page 1. The total page count is
 * re-read from the pagination list of every fetched page; if it cannot be read,
 * a warning is logged and the pages collected so far are returned.
 *
 * @returns {Promise<Array<{id: number, question: string, answer: string}>>}
 *   One entry per crawled page, in page order. `id` is the page number;
 *   `question` and `answer` are the outerHTML of the matched elements.
 * @throws {Error} When a request fails, or when a fetched page has no
 *   `.final-question` or `.design-answer-box` element.
 */
async function getNowCodeQuestions() {
    // Fetch one review page as text. The failure handler makes the promise
    // reject on a failed request instead of staying pending forever.
    const fetchReviewPage = (page) => new Promise((resolve, reject) => {
        $.get(`/ta/review-frontend/review?page=${page}`, (body) => {
            resolve(body)
        }).fail((jqXHR, textStatus, errorThrown) => {
            const reason = errorThrown || textStatus || 'unknown error'
            reject(new Error(`Request for page ${page} failed: ${reason}`))
        })
    })

    // Read the total page count from the pagination list; NaN when unavailable.
    const readPageCount = (doc) => {
        const ul = doc.querySelector('.final-pagination ul')
        if (!ul) {
            return NaN
        }
        // The second-to-last item is read as the highest page number
        // (presumably the last item is a "next" control - TODO confirm).
        const item = ul.children[ul.childElementCount - 2]
        // textContent rather than innerText: the parsed document is never rendered.
        const text = item ? String(item.textContent).trim() : ''
        // An empty label would coerce to 0 and silently end the crawl.
        return text === '' ? NaN : Number(text)
    }

    // A single parser is enough; it holds no per-document state we rely on.
    const parser = new DOMParser()
    const questions = []
    // Placeholder upper bound until the first page reveals the real count.
    let pageCount = 10000
    let curPage = 1

    while (pageCount >= curPage) {
        const body = await fetchReviewPage(curPage)
        const html = parser.parseFromString(body, "text/html")

        const questionEl = html.querySelector('.final-question')
        const answerEl = html.querySelector('.design-answer-box')
        if (!questionEl || !answerEl) {
            throw new Error(`Page ${curPage}: question or answer element not found`)
        }
        questions.push({
            id: curPage,
            question: questionEl.outerHTML,
            answer: answerEl.outerHTML
        })

        const total = readPageCount(html)
        if (!Number.isFinite(total)) {
            // Without a page count we cannot know whether more pages exist;
            // stop here but say so, rather than truncating silently.
            console.warn(`Page ${curPage}: could not read the page count, stopping early`)
            break
        }
        pageCount = total
        console.log(`爬取进度:${curPage}/${pageCount}`)
        curPage += 1
    }
    return questions
}
// Run the crawl, then print everything as one text blob: a title line
// followed by one "question / answer" entry per crawled page.
getNowCodeQuestions().then(questions=>{
	const entries = questions.map(item=>`题目${item.id}:${item.question}\n答案:${item.answer}\n`)
	const result = "<div class=\"title\">牛客网前端题目</div>\n" + entries.join('')
	console.log(result)
}).catch(error=>{
	// Report a failed crawl instead of leaving an unhandled promise rejection.
	console.error(error)
})