有这么一个需求,首先从csv文件中读取要解析的url数据,然后使用puppeteer和puppeteer-har来获取浏览器的HAR数据。在调试的过程中,发现在for循环中怎么操作都是异步的,最后找到了一个解决方案,也算在此记录。

1,创建解析csv文件的代码(ultra-harlog/module/cvsresovle.js)

const fs = require("fs");
const path = require("path");
const csv = require('csv');
const parse = require('csv-parse/lib/sync');
const iconv = require('iconv-lite'); /* npm install iconv-lite */
// NOTE(review): `path`, `csv` and `iconv` are required but not used in this module.

/**
 * Read a CSV file and parse it into an array of record objects.
 *
 * The file is read synchronously as UTF-8. The first row is used as the
 * column names, empty lines are skipped and fields are separated by commas.
 * Judging by the sample row in the original notes, a record carries keys such
 * as AREA_NAME, SITE_LINK (e.g. 'www.banggo.com'), BEARING_MODE, SITE_NAME
 * and MENU_TYPE.
 *
 * @param {string} csvpath - Path of the CSV file to read.
 * @returns {Object[]} One object per data row, keyed by column name.
 */
function readUrlRecord(csvpath) {
  console.log(`开始解析文件:${csvpath}`);

  // Read the whole file into memory.
  const input = fs.readFileSync(csvpath, 'utf8');

  // Parse the text into an array of objects keyed by the header row.
  const records = parse(input, {
    columns: true,
    skip_empty_lines: true,
    delimiter: ',',
  });

  return records;
}

// Example: readUrlRecord('../top300.csv');
exports.readUrlRecord = readUrlRecord;

2,创建抓取主代码(ultra-harlog/module/puppeteerhar-event.js)

const fs = require('fs');
const { promisify } = require('util');
const path = require("path");
const puppeteer = require('puppeteer');
const { harFromMessages } = require('chrome-har');
const logger = require("./log");
const log = logger.getPuppeteerHarEventRecordLogger();

// References:
//   https://michaljanaszek.com/blog/generate-har-with-puppeteer
//   https://www.npmjs.com/package/chrome-har

// DevTools protocol events that are recorded and later converted into a HAR.
const observe = [
  'Page.loadEventFired',
  'Page.domContentEventFired',
  'Page.frameStartedLoading',
  'Page.frameAttached',
  'Network.requestWillBeSent',
  'Network.requestServedFromCache',
  'Network.dataReceived',
  'Network.responseReceived',
  'Network.resourceChangedPriority',
  'Network.loadingFinished',
  'Network.loadingFailed',
];

/**
 * Launch a headless browser instance with caching disabled.
 *
 * @returns {Promise<Object>} Resolves with the launched puppeteer browser.
 */
async function launchBrowser() {
  return puppeteer.launch({
    // A manually downloaded Chromium needs an explicit path; by default the
    // copy under <project>/node_modules/puppeteer/.local-chromium/ is used.
    // executablePath: '/Users/huqiyang/Documents/project/z/chromium/Chromium.app/Contents/MacOS/Chromium',

    // Ignore HTTPS errors when visiting https pages.
    ignoreHTTPSErrors: true,
    // Headless mode is ON: no browser window is opened.
    headless: true,
    // Browser start-up switches, see
    // https://peter.sh/experiments/chromium-command-line-switches/
    // NOTE(review): '--enable-feaures' looks like a misspelling of
    // '--enable-features' (which would also need a value) — confirm intent.
    args: [
      "--disk-cache-size=0",
      "--disable-cache",
      '--disable-infobars',
      '--window-size=800,600',
      '--ignore-certificate-errors',
      '--enable-feaures',
    ],
    // Whether to auto-open a DevTools panel for each tab. When true, the
    // headless option is forced to false.
    devtools: false,
    // Launch timeout; defaults to 30000 (30 seconds), 0 disables it.
    timeout: 0,
    // Slow down puppeteer operations to ease debugging.
    // slowMo: 250,
  });
}

/**
 * Load a URL in a fresh browser and write the captured traffic as a HAR file.
 *
 * Navigation errors are logged and do not reject: whatever events were
 * captured up to that point are still written. The browser is always closed,
 * including when setting up the DevTools session fails.
 *
 * @param {string} url - Address to visit; "http://" is prepended when no
 *   http/https scheme is present.
 * @param {string} dirPath - Directory the HAR file is written to.
 * @param {string} filename - Name of the HAR file inside dirPath.
 * @returns {Promise<void>} Resolves once the HAR file has been written.
 */
async function saveHarlog(url, dirPath, filename) {
  // Full path of the output file.
  const harFilePath = path.join(dirPath, filename);

  // Make sure the address carries a scheme.
  const hasScheme = url.startsWith('http://') || url.startsWith('https://');
  const targetUrl = hasScheme ? url : "http://" + url;

  // Collected protocol events, converted to HAR after the browser is closed.
  const events = [];

  const browser = await launchBrowser();
  try {
    // Reuse the tab the browser opens on start-up instead of creating one.
    const page = (await browser.pages())[0];

    // Register the event listeners on a DevTools protocol session.
    const client = await page.target().createCDPSession();
    await client.send('Page.enable');
    await client.send('Network.enable');
    observe.forEach((method) => {
      client.on(method, (params) => {
        events.push({ method, params });
      });
    });

    try {
      // Navigate to the requested resource without a navigation timeout.
      await page.goto(targetUrl, { timeout: 0 });
    } catch (error) {
      log.info('resovle error :' + targetUrl + "; error message:" + error);
    }
  } finally {
    // Close the browser on every path so no Chromium process is left behind.
    await browser.close();
  }

  const har = harFromMessages(events);
  await promisify(fs.writeFile)(harFilePath, JSON.stringify(har));
}

exports.launchBrowser = launchBrowser;
exports.saveHarlog = saveHarlog;

3,创建启动文件(ultra-harlog/puppeteerhar-event-app.js)

const fs = require("fs");
const path = require("path");
const moment = require("moment");
const schedule = require('node-schedule');
const cvsresovler = require("./module/cvsresovle");
const mhar = require("./module/puppeteerhar-event");

/*
cnpm install --save moment
cnpm install --save csv
cnpm install --save node-schedule
cnpm install --save puppeteer
cnpm install --save puppeteer-har
cnpm install --save iconv-lite
cnpm install --save chrome-har
cnpm install --save grpc
*/

/**
 * Make sure the output directory harlogs/<ftime> exists next to this file.
 *
 * @param {string} ftime - Timestamp used as the directory name.
 * @returns {string} Absolute path of the output directory.
 */
function prepareOutputDir(ftime) {
  const dirPath = path.join(__dirname, 'harlogs', ftime);
  console.log("创建目录:" + dirPath);

  const alreadyThere = fs.existsSync(dirPath) && fs.lstatSync(dirPath).isDirectory();
  if (!alreadyThere) {
    console.log("创建文件夹" + ftime);
    // recursive: also creates the parent "harlogs" directory on the first run.
    fs.mkdirSync(dirPath, { recursive: true });
  }
  return dirPath;
}

/**
 * Turn a URL into a file-name stem by replacing slashes and backslashes.
 *
 * @param {string} url - URL as read from the input.
 * @returns {string} The URL with every "/" and "\" replaced by "-".
 */
function toFileStem(url) {
  return url.replace(/\//g, '-').replace(/\\/g, '-');
}

/**
 * Capture a HAR file for every record, strictly one after another.
 *
 * Awaiting inside a plain loop of an async function keeps the captures
 * sequential: the next browser is only started after the previous capture
 * has finished. Records without a usable SITE_LINK are skipped, and a failed
 * capture is logged without stopping the remaining ones.
 *
 * @param {Object[]} dataArr - Parsed CSV records; SITE_LINK holds the URL.
 * @param {string} dirPath - Directory the HAR files are written to.
 * @returns {Promise<void>} Resolves when all records have been processed.
 */
async function captureAll(dataArr, dirPath) {
  for (let i = 0; i < dataArr.length; i += 1) {
    const record = dataArr[i];
    const rawLink = record ? record['SITE_LINK'] : undefined;
    const url = typeof rawLink === 'string' ? rawLink.trim() : '';
    if (!url) {
      continue;
    }

    const filename = toFileStem(url) + '.har';
    console.log((i + 1) + "-starting to resovle url :" + url);
    try {
      await mhar.saveHarlog(url, dirPath, "N" + "-" + filename);
    } catch (error) {
      console.log(error);
    }
  }
}

/**
 * Register the scheduled job that captures HAR files for every URL listed
 * in top300.csv.
 */
function init() {
  console.log('初始化调度器');

  // Six-field cron expression (second minute hour ...): 08:55:00 every day.
  schedule.scheduleJob('0 55 8 * * *', () => {
    const ftime = moment().format('YYYYMMDDHHmm');
    console.log('当前调度时间为:' + ftime);
    const dirPath = prepareOutputDir(ftime);

    // Load the URLs that have to be processed.
    const dataArr = cvsresovler.readUrlRecord(path.join(__dirname, 'top300.csv'));
    console.log("解析出URL共计" + dataArr.length + "条");

    // Run the captures sequentially; report anything unexpected instead of
    // leaving a rejected promise unhandled.
    captureAll(dataArr, dirPath).catch((error) => {
      console.log(error);
    });
  });

  console.log('应用程序启动完成');
}

// Start the scheduler.
init();

/**
 * Manual test: capture a single URL, taken from the first command-line
 * argument or defaulting to www.baidu.com.
 *
 * @returns {Promise<void>} Resolves when the capture attempt has finished.
 */
async function test() {
  const ftime = moment().format('YYYYMMDDHHmm');
  console.log('当前执行时间为:' + ftime);
  const dirPath = prepareOutputDir(ftime);

  // slice() leaves process.argv untouched; "arguments" is avoided as a name
  // because declaring it is a syntax error in strict mode.
  const cliArgs = process.argv.slice(2);
  let url = cliArgs.length > 0 ? cliArgs[0] : "www.baidu.com";
  url = url.trim();

  // The file name is derived before the scheme is added, so it stays short.
  const filename = toFileStem(url) + '.har';
  if (url) {
    if (!(url.startsWith('http://') || url.startsWith('https://'))) {
      url = "http://" + url;
    }
    console.log("starting to resovle test url :" + url);
    try {
      await mhar.saveHarlog(url, dirPath, "NT" + "-" + filename);
    } catch (error) {
      console.log(error);
    }
  }
}

// Run the single-URL test by uncommenting the call below.
// test();