|  | @@ -0,0 +1,606 @@
 | 
	
		
			
				|  |  | +/**
 | 
	
		
			
				|  |  | + * 重庆材料信息价爬虫
 | 
	
		
			
				|  |  | + * 由于headless chrome “puppeteer”占用资源比较大,且材料信息价的网站渲染的是静态内容,因此不需要使用puppeteer。
 | 
	
		
			
				|  |  | + * 数据获取使用cheerio(解析html,可用类jquery语法操作生成的数据)
 | 
	
		
			
				|  |  | + */
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +module.exports = {
 | 
	
		
			
				|  |  | +    crawlData,
 | 
	
		
			
				|  |  | +};
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +const cheerio = require('cheerio');
 | 
	
		
			
				|  |  | +const axios = require('axios');
 | 
	
		
			
				|  |  | +const querystring = require('querystring');
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				// Page types: request paths (relative to the axios baseURL) of the three price listings.
				// Frozen so the lookup table cannot be modified by accident at runtime.
				const PageType = Object.freeze({
				    GENERAL: '/Index.aspx', // main material information prices
				    AREA: '/AreaIndex.aspx', // district/county local material prices
				    MIXED: '/ReadyMixedIndex.aspx', // ready-mixed mortar prices
				});
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				/**
				 * Build the postback form fields for the main material price page.
				 * The hidden-field values (__VIEWSTATE etc.) are copied from the given page.
				 * @param {Object} $ - cheerio instance of the page the form is taken from
				 * @param {Object} props - submit properties: eventTarget, period, classID, materialClass, page
				 * @return {Object} form fields keyed by control name; the page field defaults to '1'
				 */
				function getGeneralDataBody($, props) {
				    const { eventTarget, period, classID, materialClass, page } = props;
				    const category = materialClass || '';
				    const fields = {
				        __EVENTTARGET: eventTarget || '',
				        __EVENTARGUMENT: '',
				        __VIEWSTATE: $('#__VIEWSTATE').val(),
				        __VIEWSTATEGENERATOR: $('#__VIEWSTATEGENERATOR').val(),
				        ID_ucPrice$linkvv: period,
				        ID_ucPrice$linkcategory: category,
				        ID_ucPrice$LinkValue: `${classID},${period},${category}`,
				        ID_ucPrice$txtsonclass: `sonclass${classID}`,
				        ID_ucPrice$txtfatherclass: $('#ID_ucPrice_txtfatherclass').val(),
				        ID_ucPrice$txtClassId: classID || '',
				        ID_ucPrice$ddlSearchYear: '请选择',
				        ID_ucPrice$ddlSearchMonth: '请选择',
				        ID_ucPrice$txtSearchCailiao: '',
				        ID_ucPrice$UcPager1$listPage: (page && String(page)) || '1',
				    };
				    if (eventTarget) {
				        return fields;
				    }
				    // Without an event target the link button value is submitted as well.
				    fields.ID_ucPrice$btnLink = $('#ID_ucPrice_btnLink').val();
				    return fields;
				}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				/**
				 * Build the postback form fields for the district/county local material price page.
				 * @param {Object} $ - cheerio instance of the page the hidden fields are taken from
				 * @param {Object} props - submit properties: eventTarget, period, page
				 * @return {Object} form fields, or an empty object when props is missing or has no keys
				 */
				function getAreaDataBody($, props) {
				    const hasProps = Boolean(props) && Object.keys(props).length > 0;
				    if (!hasProps) {
				        return {};
				    }
				    const { eventTarget, period, page } = props;
				    const fields = {
				        __EVENTTARGET: eventTarget || '',
				        __EVENTARGUMENT: '',
				        __VIEWSTATE: $('#__VIEWSTATE').val(),
				        __VIEWSTATEGENERATOR: $('#__VIEWSTATEGENERATOR').val(),
				        ID_ucAreaPrice$linkvv: period,
				        ID_ucAreaPrice$LinkValue: '',
				        ID_ucAreaPrice$dropArea: 'code',
				        ID_ucAreaPrice$txtSearchCailiao: '',
				        ID_ucAreaPrice$UcPager1$listPage: (page && String(page)) || '1',
				    };
				    // The button field is only submitted when no event target is given.
				    return eventTarget ? fields : { ...fields, ID_ucAreaPrice$btnAreaMaster: 'Button' };
				}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				/**
				 * Build the postback form fields for the ready-mixed mortar price page.
				 * @param {Object} $ - cheerio instance of the page the hidden fields are taken from
				 * @param {Object} props - submit properties: eventTarget, period, page
				 * @return {Object} form fields, or an empty object when props is missing or has no keys
				 */
				function getMixedDataBody($, props) {
				    const hasProps = Boolean(props) && Object.keys(props).length > 0;
				    if (!hasProps) {
				        return {};
				    }
				    const { eventTarget, period, page } = props;
				    const fields = {
				        __EVENTTARGET: eventTarget || '',
				        __EVENTARGUMENT: '',
				        __VIEWSTATE: $('#__VIEWSTATE').val(),
				        __VIEWSTATEGENERATOR: $('#__VIEWSTATEGENERATOR').val(),
				        ID_ucReadyMixedPrice$linkvv: period,
				        ID_ucReadyMixedPrice$LinkValue: '',
				        ID_ucReadyMixedPrice$dropArea: 'code',
				        ID_ucReadyMixedPrice$txtSearchCailiao: '',
				        ID_ucReadyMixedPrice$UcPager1$listPage: (page && String(page)) || '1',
				    };
				    // The button field is only submitted when no event target is given.
				    return eventTarget ? fields : { ...fields, ID_ucReadyMixedPrice$btnAreaMaster: 'Button' };
				}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +// 获取提交
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				// Request timeout in milliseconds; also quoted in the timeout error message below.
				const TIME_OUT = 10000;

				// Shared axios instance used for every request of this crawler.
				const axiosInstance = axios.create({
				    baseURL: 'http://www.cqsgczjxx.org/Jgxx/',
				    timeout: TIME_OUT,
				    // NOTE(review): all traffic is routed through a local Fiddler capture proxy.
				    // This looks like a debugging aid; requests will fail when nothing listens on
				    // 127.0.0.1:8888 - confirm whether it should ship.
				    proxy: {
				        host: '127.0.0.1',
				        port: 8888, // axios documents the proxy port as a number
				    },
				    headers: {
				        'Cache-Control': 'max-age=0',
				        'Content-Type': 'application/x-www-form-urlencoded',
				        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36',
				        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
				        'Accept-Encoding': 'gzip, deflate',
				        'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7,zh-CN;q=0.6',
				    },
				    responseType: 'document',
				});

				// Response interceptor: successful responses pass through unchanged; timeout failures
				// are replaced by a readable message, every other failure is rejected as-is.
				axiosInstance.interceptors.response.use(
				    function (response) {
				        return response;
				    },
				    function (error) {
				        // Guard against rejections that carry no string message.
				        const message = error && typeof error.message === 'string' ? error.message : '';
				        if (message.includes('timeout')) {
				            return Promise.reject(`目标网络超时,请稍后再试。(${TIME_OUT}ms)`);
				        }
				        return Promise.reject(error);
				    }
				);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				// Requests must carry the session cookie, otherwise some of them come back with HTTP 500
				// (presumably an anti-crawling measure of the site).
				let curCookie = '';

				/**
				 * Load a page and parse it into a jQuery-like cheerio root.
				 * The request is always a POST; when a body is given it is sent url-encoded.
				 * The first cookie of a `set-cookie` response header is remembered and sent
				 * with subsequent requests.
				 * @param {String} url - page path appended to the instance baseURL
				 * @param {Object} [body] - form data
				 * @return {Promise<Function>} cheerio root of the returned HTML
				 */
				async function loadPage(url, body) {
				    const config = curCookie ? { headers: { Cookie: curCookie } } : {};
				    const payload = body ? querystring.stringify(body) : null;
				    const rst = await axiosInstance.post(url, payload, config);
				    // Refresh the cookie; an empty or missing `set-cookie` header keeps the current one.
				    const cookies = rst.headers['set-cookie'];
				    if (Array.isArray(cookies) && cookies.length > 0) {
				        // Keep only the "name=value" part, dropping attributes such as path.
				        curCookie = cookies[0].split(';')[0];
				    }
				    return cheerio.load(rst.data);
				}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				// Maps a month number (1-12) to its zero-padded two-digit string, e.g. 5 -> '05'.
				const monthMap = {};
				for (let month = 1; month <= 12; month++) {
				    monthMap[String(month)] = String(month).padStart(2, '0');
				}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				/**
				 * Resolve every period (month) between `from` and `to`, inclusive, to the uid used by the site.
				 * Periods are compared as year/month numbers, so ranges may cross year boundaries
				 * (e.g. 2019-11 to 2020-02).
				 * @param {String} from - first period, eg: 2020-01
				 * @param {String} to - last period, eg: 2020-05
				 * @param {Object} $index - cheerio root of the initial page (contains the #PriceLMenu period menu)
				 * @return {Array[object] || Null} eg: [{period: '2020-05', uid: 'XCCXXXXX-XX'}];
				 *         null when the range is unparsable, reversed, or contains a period missing from the menu
				 */
				function getPeriodData(from, to, $index) {
				    const reg = /(\d+)-(\d+)/;
				    const fromMatch = typeof from === 'string' ? from.match(reg) : null;
				    const toMatch = typeof to === 'string' ? to.match(reg) : null;
				    if (!fromMatch || !toMatch) {
				        return null;
				    }
				    const fromMonth = +fromMatch[2];
				    const toMonth = +toMatch[2];
				    if (fromMonth < 1 || fromMonth > 12 || toMonth < 1 || toMonth > 12) {
				        return null;
				    }
				    // Count months linearly so the loop bound stays correct across year boundaries.
				    const first = +fromMatch[1] * 12 + (fromMonth - 1);
				    const last = +toMatch[1] * 12 + (toMonth - 1);
				    if (first > last) {
				        return null;
				    }
				    const $period = $index('#PriceLMenu');
				    const list = [];
				    for (let cur = first; cur <= last; cur++) {
				        const year = Math.floor(cur / 12);
				        const month = (cur % 12) + 1;
				        const uid = getPeriodUID(year, month, $period);
				        // An invalid period anywhere in the range invalidates the whole range.
				        if (!uid) {
				            return null;
				        }
				        list.push({
				            period: `${year}-${String(month).padStart(2, '0')}`,
				            uid,
				        });
				    }
				    return list;

				    // Look up the uid of one period in the menu; null when the year or month is not listed.
				    function getPeriodUID(year, month, $period) {
				        const $year = $period.find('.MenuOneTitle').filter(function () {
				            return $index(this).text() === `${year}年`;
				        });
				        if (!$year.length) {
				            return null;
				        }
				        const $month = $year.parent().next().find('a').filter(function () {
				            return $index(this).text() === `${month}月`;
				        });
				        if (!$month.length) {
				            return null;
				        }
				        // The uid is embedded in the onclick handler and has to be extracted from it.
				        const onclickText = String($month.attr('onclick') || '');
				        const matched = onclickText.match(/Onlink\('([^']+)'/);
				        if (!matched || !matched[1]) {
				            return null;
				        }
				        return matched[1];
				    }
				}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				// Table types: select which column layout is used when a result table is parsed.
				// Frozen so the lookup table cannot be modified by accident at runtime.
				const TableType = Object.freeze({
				    BUILDING: 1, // main materials: construction and installation engineering materials
				    GARDEN: 2, // main materials: landscaping
				    ENERGY: 3, // main materials: energy-saving building engineering materials
				    AREA: 4, // area related (district/county materials)
				    MIXED: 5, // area related (ready-mixed mortar)
				});
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				/**
				 * Parse the result table of a page with the parser matching the table type.
				 * @param {Object} $page - cheerio root of the page
				 * @param {Number} type - table type (one of TableType)
				 * @return {Array[object]} parsed rows; an empty array for an unknown type
				 */
				function crawlTableData($page, type) {
				    if (type === TableType.BUILDING || type === TableType.ENERGY) {
				        return crawlNormalTable($page);
				    }
				    if (type === TableType.GARDEN) {
				        return crawlGardenTable($page);
				    }
				    if (type === TableType.AREA) {
				        return crawlAreaTable($page, '#ID_ucAreaPrice_gridView');
				    }
				    if (type === TableType.MIXED) {
				        return crawlAreaTable($page, '#ID_ucReadyMixedPrice_gridView');
				    }
				    return [];
				}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				/**
				 * Parse a table whose columns are:
				 * No. | material name | specification/model | unit | price incl. tax | price excl. tax | remark
				 * The cell texts are read as one flat list of spans; every 7th span (the row number)
				 * is dropped and the rest is grouped into rows of 6 fields. A trailing incomplete
				 * group is discarded.
				 * @param {Object} $page - cheerio root of the page
				 * @return {Array[object]} rows with name, specs, unit, taxPrice, noTaxPrice, remark
				 */
				function crawlNormalTable($page) {
				    const fields = ['name', 'specs', 'unit', 'taxPrice', 'noTaxPrice', 'remark'];
				    const texts = [];
				    $page('#ID_ucPrice_gridView')
				        .find('tr td span')
				        .filter((position) => position % 7 !== 0) // skip the row-number column
				        .each(function () {
				            texts.push($page(this).text());
				        });
				    const data = [];
				    for (let start = 0; start + fields.length <= texts.length; start += fields.length) {
				        const row = {};
				        fields.forEach((field, offset) => {
				            row[field] = texts[start + offset];
				        });
				        data.push(row);
				    }
				    console.log(data);
				    return data;
				}
 | 
	
		
			
				/**
				 * Parse a table whose columns are:
				 * No. | genus | product name | height (cm) | trunk diameter (cm) | crown diameter (cm) |
				 * branch height (cm) | unit | price incl. tax | price excl. tax | remark
				 * The cell texts are read as one flat list of spans; every 11th span (the row number)
				 * is dropped and the rest is grouped into rows of 10 fields. A trailing incomplete
				 * group is discarded.
				 * @param {Object} $page - cheerio root of the page
				 * @return {Array[object]} parsed rows
				 */
				function crawlGardenTable($page) {
				    const fields = [
				        'genera',
				        'name',
				        'height',
				        'branchDiameter',
				        'crownDiameter',
				        'branchHeight',
				        'unit',
				        'taxPrice',
				        'noTaxPrice',
				        'remark',
				    ];
				    const texts = [];
				    $page('#ID_ucPrice_gridView')
				        .find('tr td span')
				        .filter((position) => position % 11 !== 0) // skip the row-number column
				        .each(function () {
				            texts.push($page(this).text());
				        });
				    const data = [];
				    for (let start = 0; start + fields.length <= texts.length; start += fields.length) {
				        const row = {};
				        fields.forEach((field, offset) => {
				            row[field] = texts[start + offset];
				        });
				        data.push(row);
				    }
				    console.log(data);
				    return data;
				}
 | 
	
		
			
				/**
				 * Parse a table whose columns are:
				 * No. | district/county | material name | specification and model | unit of measure |
				 * price incl. tax | price excl. tax
				 * The cell texts are read as one flat list of spans; every 7th span (the row number)
				 * is dropped and the rest is grouped into rows of 6 fields. A trailing incomplete
				 * group is discarded.
				 * @param {Object} $page - cheerio root of the page
				 * @param {String} viewSelector - selector (ID) of the table
				 * @return {Array[object]} rows with area, name, specs, unit, taxPrice, noTaxPrice
				 */
				function crawlAreaTable($page, viewSelector) {
				    const fields = ['area', 'name', 'specs', 'unit', 'taxPrice', 'noTaxPrice'];
				    const texts = [];
				    $page(viewSelector)
				        .find('tr td span')
				        .filter((position) => position % 7 !== 0) // skip the row-number column
				        .each(function () {
				            texts.push($page(this).text());
				        });
				    const data = [];
				    for (let start = 0; start + fields.length <= texts.length; start += fields.length) {
				        const row = {};
				        fields.forEach((field, offset) => {
				            row[field] = texts[start + offset];
				        });
				        data.push(row);
				    }
				    console.log(data);
				    return data;
				}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				// Event targets: values posted as __EVENTTARGET to trigger the "next page" button of each pager.
				// Frozen so the lookup table cannot be modified by accident at runtime.
				// Note: inside this module the name shadows the global EventTarget class.
				const EventTarget = Object.freeze({
				    GENERAL_NEXT: 'ID_ucPrice$UcPager1$btnNext',
				    AREA_NEXT: 'ID_ucAreaPrice$UcPager1$btnNext',
				    // NOTE(review): this value uses '_' separators while the two above use '$' -
				    // confirm against a captured request that the server accepts it.
				    MIXED_NEXT: 'ID_ucReadyMixedPrice_UcPager1_btnNext',
				});
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				/**
				 * Crawl the table data of a listing page by page.
				 * The first page is requested with a form built from `$index`; its pager label
				 * (eg: "1/10") gives the total page count. The remaining pages are requested with
				 * forms built from the first page, at most 6 requests at a time, and their rows are
				 * appended in page order.
				 * @param {Object} $index - cheerio root of the index page
				 * @param {Object} props - submit properties for the form
				 * @param {String} pageType - page type (one of PageType); anything other than
				 *        GENERAL or AREA is handled as MIXED
				 * @param {Number} tableType - table type (one of TableType)
				 * @return {Promise<Array[object]>} rows of all pages
				 */
				async function crawlPagesData($index, props, pageType, tableType) {
				    // Per page type: form builder, pager label selector and "next page" event target.
				    let buildBody;
				    let pageStateSelector;
				    let nextTarget;
				    switch (pageType) {
				        case PageType.GENERAL:
				            buildBody = getGeneralDataBody;
				            pageStateSelector = '#ID_ucPrice_UcPager1_lbPage';
				            nextTarget = EventTarget.GENERAL_NEXT;
				            break;
				        case PageType.AREA:
				            buildBody = getAreaDataBody;
				            pageStateSelector = '#ID_ucAreaPrice_UcPager1_lbPage';
				            nextTarget = EventTarget.AREA_NEXT;
				            break;
				        default:
				            buildBody = getMixedDataBody;
				            pageStateSelector = '#ID_ucReadyMixedPrice_UcPager1_lbPage';
				            nextTarget = EventTarget.MIXED_NEXT;
				    }

				    // First page
				    const $firstPage = await loadPage(pageType, buildBody($index, props));
				    const rst = [...crawlTableData($firstPage, tableType)];

				    // Remaining pages, requested in batches
				    const totalPage = Number($firstPage(pageStateSelector).text().split('/')[1]); // label eg: 1/10
				    const asyncCount = 6; // maximum number of requests per batch
				    for (let batchStart = 1; batchStart < totalPage; batchStart += asyncCount) {
				        const batch = [];
				        for (let page = batchStart; page < totalPage && page < batchStart + asyncCount; page++) {
				            batch.push(crawlPageData(page));
				        }
				        const allData = await Promise.all(batch);
				        for (const data of allData) {
				            rst.push(...data);
				        }
				    }
				    return rst;

				    // Request the page reached via the "next page" event from `page` and parse its table.
				    async function crawlPageData(page) {
				        const pageProps = { ...props, page, eventTarget: nextTarget };
				        const $page = await loadPage(pageType, buildBody($firstPage, pageProps));
				        return crawlTableData($page, tableType);
				    }
				}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				/**
				 * Crawl one engineering class of the main material page: construction and installation
				 * materials, landscaping materials, or energy-saving building materials.
				 * For TableType.BUILDING the rows are crawled per material class and grouped;
				 * for the other types all rows are returned as one flat list.
				 * @param {String} period - period uid
				 * @param {String} classID - engineering class id
				 * @param {Object} $index - cheerio root of the initial page
				 * @param {Number} type - table type
				 * @return {Array[object]} BUILDING: [{ materialClass: '一、黑色及有色金属', items: [...] }];
				 *         otherwise the item rows themselves
				 * @throws {String} when no material class can be found for TableType.BUILDING
				 */
				async function crawlGeneralSubData(period, classID, $index, type) {
				    const body = getGeneralDataBody($index, { period, classID });
				    console.time('crawlGeneralSubData');
				    try {
				        const $engineeringClassPage = await loadPage(PageType.GENERAL, body);
				        const rst = [];
				        if (type === TableType.BUILDING) {
				            // NOTE(review): the material classes are read from the initial page, not from
				            // the class page loaded above - confirm both list the same classes.
				            const classList = crawlMaterialClassList($index('#ID_ucPrice_CategoryLabel'));
				            if (!classList.length) {
				                throw '无法爬取到材料分类。';
				            }
				            console.log(classList);
				            for (const materialClass of classList) {
				                const items = await crawlPagesData($engineeringClassPage, { period, classID, materialClass }, PageType.GENERAL, type);
				                rst.push({ materialClass, items });
				            }
				        } else {
				            const items = await crawlPagesData($engineeringClassPage, { period, classID, materialClass: '' }, PageType.GENERAL, type);
				            rst.push(...items);
				        }
				        // The crawled data has to be handed back to the caller.
				        return rst;
				    } finally {
				        // End the timer on failure too, so the label can be reused by the next call.
				        console.timeEnd('crawlGeneralSubData');
				    }

				    // Collect the link texts of the material class list.
				    function crawlMaterialClassList($class) {
				        const list = [];
				        $class.find('a').each(function () {
				            list.push($index(this).text());
				        });
				        return list;
				    }
				}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				/**
				 * Crawl the main material information prices of one period (this part serves as the
				 * general library). Each engineering class found in the class tab bar is crawled in turn.
				 * @param {String} period - period uid
				 * @param {Object} $index - cheerio root of the initial page
				 * @return {Object} { building, garden, energy }; a key is only present when the
				 *         corresponding engineering class exists on the page
				 * @throws {String} when the class id of a tab link cannot be extracted
				 */
				async function crawlGeneralData(period, $index) {
				    const { building, garden, energy } = crawlClass($index('#ID_ucPrice_tabNewBar'));
				    const rst = {};
				    if (building) {
				        rst.building = await crawlGeneralSubData(period, building, $index, TableType.BUILDING);
				    }
				    if (garden) {
				        // For landscaping materials the class of a row is its "genus" column.
				        rst.garden = await crawlGeneralSubData(period, garden, $index, TableType.GARDEN);
				    }
				    if (energy) {
				        // All rows of the green/energy-saving materials belong to that single class.
				        rst.energy = await crawlGeneralSubData(period, energy, $index, TableType.ENERGY);
				    }
				    return rst;

				    // Extract the class ids of the known engineering classes from the tab links.
				    function crawlClass($class) {
				        let building; // construction and installation engineering materials
				        let garden; // landscaping engineering materials
				        let energy; // green, energy-saving building engineering materials
				        const reg = /OnClassson\('([^']+)'/;
				        $class.find('a').each(function () {
				            const $link = $index(this);
				            const text = $link.text();
				            // A link without an onclick handler is reported like any other unparsable link
				            // instead of failing with a TypeError.
				            const onclickText = String($link.attr('onclick') || '');
				            const matched = onclickText.match(reg);
				            if (!matched || !matched[1]) {
				                throw '无法爬取到工程分类。';
				            }
				            if (text === '建安工程材料') {
				                building = matched[1];
				            } else if (text === '园林绿化工程材料') {
				                garden = matched[1];
				            } else if (text === '绿色、节能建筑工程材料') {
				                energy = matched[1];
				            }
				        });
				        return { building, garden, energy };
				    }
				}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				/**
				 * Crawl the district/county local material prices of one period.
				 * @param {String} period - period uid
				 * @return {Promise<Array[object]>} rows of all pages
				 */
				async function crawlAreaData(period) {
				    // Load the initial area page first; its hidden fields seed the form of the first request.
				    const $index = await loadPage(PageType.AREA);
				    const props = { period };
				    return crawlPagesData($index, props, PageType.AREA, TableType.AREA);
				}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				/**
				 * Crawl the ready-mixed mortar prices of one period.
				 * @param {String} period - period uid
				 * @return {Promise<Array[object]>} rows of all pages
				 */
				async function crawlMixedData(period) {
				    // Load the initial mortar page first; its hidden fields seed the form of the first request.
				    const $index = await loadPage(PageType.MIXED);
				    const props = { period };
				    return crawlPagesData($index, props, PageType.MIXED, TableType.MIXED);
				}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				/**
				 * Transform the crawled data of one period and save it.
				 * TODO: not implemented yet - the function body is empty.
				 * @param {String} period - period, eg: '2020-05'
				 * @param {Object} generalData - main materials { building, garden, energy }
				 * @param {Array[object]} areaData - district/county materials
				 * @param {Array[object]} mixedData - district/county ready-mixed mortar
				 */
				function transfromAndSave(period, generalData, areaData, mixedData) {
				}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				/**
				 * Crawl all price data of the periods from `from` to `to`, one period after another.
				 * The data of every completed period is passed to transfromAndSave and collected in the
				 * returned object. Failures are logged, not rethrown: the log names the periods that
				 * were crawled successfully and the data collected so far is still returned.
				 * @param {String} from - first period, eg: 2020-01
				 * @param {String} to - last period, eg: 2020-05
				 * @return {Object} crawled data keyed by period,
				 *         eg: { '2020-05': { generalData, areaData, mixedData } }
				 */
				async function crawlData(from, to) {
				    const results = {};
				    let curPeriod;
				    try {
				        const $index = await loadPage(PageType.GENERAL);
				        const periodData = getPeriodData(from, to, $index);
				        if (!periodData) {
				            throw '无效的期数区间。';
				        }
				        console.log(periodData);
				        // Crawl period by period
				        for (const periodItem of periodData) {
				            // Main material information prices (the initial page is the main material page)
				            const generalData = await crawlGeneralData(periodItem.uid, $index);
				            // District/county local material prices
				            const areaData = await crawlAreaData(periodItem.uid);
				            // Ready-mixed mortar prices
				            const mixedData = await crawlMixedData(periodItem.uid);
				            // Hand the crawled data over instead of discarding it.
				            // NOTE(review): presumably this is where the data was meant to be saved - confirm.
				            await transfromAndSave(periodItem.period, generalData, areaData, mixedData);
				            results[periodItem.period] = { generalData, areaData, mixedData };
				            curPeriod = periodItem.period;
				        }
				    } catch (err) {
				        // On failure, report which periods were crawled successfully
				        let errTip = '';
				        if (curPeriod) {
				            errTip += `\n成功爬取期数为:${from}到${curPeriod}`;
				        }
				        const errStr = String(err) + errTip;
				        console.log(`err`);
				        console.log(errStr);
				    }
				    return results;
				}
 |