12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044 |
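/**
 * @author vian
 * Crawler for the Chongqing construction-material information prices.
 * Headless Chrome ("puppeteer") is resource-hungry, and the price pages are server-side-rendered
 * static content, so puppeteer is not needed.
 * Data is extracted with cheerio (it parses HTML and exposes a jQuery-like API over the result).
 */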
// Public API of this module: only the crawl entry point is exposed.
// `crawlData` is a function declaration further down, so it is hoisted and can be exported here.
module.exports = { crawlData };
- const cheerio = require('cheerio');
- const axios = require('axios');
- const querystring = require('querystring');
- const uuidV1 = require('uuid/v1');
- const mongoose = require('mongoose');
- const { isDef } = require('../../../public/common_util');
// Mongoose models used by the crawler, looked up by name.
// NOTE(review): presumably the schemas are registered elsewhere before this module is loaded — confirm.
const [
  compilationModel,
  priceInfoLibModel,
  priceInfoClassModel,
  priceInfoItemModel,
  priceInfoAreaModel,
] = [
  'compilation',
  'std_price_info_lib',
  'std_price_info_class',
  'std_price_info_items',
  'std_price_info_areas',
].map((modelName) => mongoose.model(modelName));
// Master switch for the diagnostic output produced through debugConsole.
const isDebug = true;

/**
 * Forward a value to a `console` method, but only while `isDebug` is on.
 * @param {*} str - value handed to the console method
 * @param {String} [type='log'] - name of the console method to call (e.g. 'log', 'time', 'timeEnd')
 */
function debugConsole(str, type = 'log') {
  if (!isDebug) {
    return;
  }
  console[type](str);
}
// Known area names. The position in this list (1-based) is used as the area's serialNo;
// a name that is not listed gets serialNo 0 (indexOf(...) + 1).
const areas = [
  '主城区', '渝中区', '江北区', '沙坪坝区', '南岸区',
  '九龙坡区', '大渡口区', '北碚区', '渝北区', '巴南区',
  '万州区', '涪陵区', '万盛区', '双桥区', '黔江区',
  '长寿区', '江津区', '合川区', '永川区', '南川区',
  '綦江县', '潼南县', '铜梁县', '大足县', '荣昌县',
  '璧山县', '梁平区', '城口县', '丰都县', '垫江县',
  '忠县', '开州区', '云阳县', '奉节县', '巫山县',
  '巫溪县', '石柱县', '秀山县', '酉阳县', '彭水县',
  '大足区', '綦江区', '万盛经开区', '双桥经开区', '铜梁区',
  '璧山区', '荣昌县1', '荣昌县2', '彭水县1', '彭水县2',
  '彭水县3', '潼南区', '荣昌区1', '荣昌区2', '武隆区',
  '武隆区1', '武隆区2', '武隆区3', '武隆区4', '武隆区5',
  '武隆区6',
];
// Page types. Each value is the page path that loadPage posts to through the axios instance.
const PageType = {
  // main material prices
  GENERAL: '/Index.aspx',
  // per-district local material prices
  AREA: '/AreaIndex.aspx',
  // ready-mixed mortar prices
  MIXED: '/ReadyMixedIndex.aspx',
};
/**
 * Build the form fields posted to the main material price page.
 * The hidden state fields (__VIEWSTATE etc.) are copied from the given page.
 * @param {Object} $ - cheerio handle of the page the hidden form state is read from
 * @param {Object} props - submission options: eventTarget, period, classID, materialClass, page
 * @return {Object} form fields, in the order they will be serialized
 */
function getGeneralDataBody($, props) {
  const { eventTarget, period, materialClass, classID, page } = props;
  const category = materialClass || '';
  const form = {
    __EVENTTARGET: eventTarget || '',
    __EVENTARGUMENT: '',
    __VIEWSTATE: $('#__VIEWSTATE').val(),
    __VIEWSTATEGENERATOR: $('#__VIEWSTATEGENERATOR').val(),
    ID_ucPrice$linkvv: period,
    ID_ucPrice$linkcategory: category,
    ID_ucPrice$LinkValue: `${classID},${period},${category}`,
    ID_ucPrice$txtsonclass: `sonclass${classID}`,
    ID_ucPrice$txtfatherclass: $('#ID_ucPrice_txtfatherclass').val(),
    ID_ucPrice$txtClassId: classID || '',
    ID_ucPrice$ddlSearchYear: '请选择',
    ID_ucPrice$ddlSearchMonth: '请选择',
    ID_ucPrice$txtSearchCailiao: '',
    ID_ucPrice$UcPager1$listPage: (page && String(page)) || '1',
  };
  if (eventTarget) {
    return form;
  }
  // Without an event target the link button's value is submitted as well.
  return { ...form, ID_ucPrice$btnLink: $('#ID_ucPrice_btnLink').val() };
}
/**
 * Build the form fields posted to the per-district local material price page.
 * @param {Object} $ - cheerio handle of the page the hidden form state is read from
 * @param {Object} props - submission options: eventTarget, period, page
 * @return {Object} form fields; an empty object when props is missing or has no keys
 */
function getAreaDataBody($, props) {
  if (!props || Object.keys(props).length === 0) {
    return {};
  }
  const { eventTarget, period, page } = props;
  const form = {
    __EVENTTARGET: eventTarget || '',
    __EVENTARGUMENT: '',
    __VIEWSTATE: $('#__VIEWSTATE').val(),
    __VIEWSTATEGENERATOR: $('#__VIEWSTATEGENERATOR').val(),
    ID_ucAreaPrice$linkvv: period,
    ID_ucAreaPrice$LinkValue: '',
    ID_ucAreaPrice$dropArea: 'code',
    ID_ucAreaPrice$txtSearchCailiao: '',
    ID_ucAreaPrice$UcPager1$listPage: (page && String(page)) || '1',
  };
  if (eventTarget) {
    return form;
  }
  // Without an event target the submit button field is sent as well.
  return { ...form, ID_ucAreaPrice$btnAreaMaster: 'Button' };
}
/**
 * Build the form fields posted to the ready-mixed mortar price page.
 * @param {Object} $ - cheerio handle of the page the hidden form state is read from
 * @param {Object} props - submission options: eventTarget, period, page
 * @return {Object} form fields; an empty object when props is missing or has no keys
 */
function getMixedDataBody($, props) {
  if (!props || Object.keys(props).length === 0) {
    return {};
  }
  const { eventTarget, period, page } = props;
  const form = {
    __EVENTTARGET: eventTarget || '',
    __EVENTARGUMENT: '',
    __VIEWSTATE: $('#__VIEWSTATE').val(),
    __VIEWSTATEGENERATOR: $('#__VIEWSTATEGENERATOR').val(),
    ID_ucReadyMixedPrice$linkvv: period,
    ID_ucReadyMixedPrice$LinkValue: '',
    ID_ucReadyMixedPrice$dropArea: 'code',
    ID_ucReadyMixedPrice$txtSearchCailiao: '',
    ID_ucReadyMixedPrice$UcPager1$listPage: (page && String(page)) || '1',
  };
  if (eventTarget) {
    return form;
  }
  // Without an event target the submit button field is sent as well.
  return { ...form, ID_ucReadyMixedPrice$btnAreaMaster: 'Button' };
}
// Request timeout in milliseconds; also quoted in the timeout message produced below.
const TIME_OUT = 60000;

// Shared axios instance: every page request goes through it with browser-like headers.
const axiosInstance = axios.create({
  baseURL: 'http://www.cqsgczjxx.org/Jgxx/',
  timeout: TIME_OUT,
  // For traffic inspection with Fiddler, add
  //   proxy: { host: "127.0.0.1", port: "8888" }
  // Fiddler must be running, otherwise requests fail with a connect error.
  headers: {
    'Cache-Control': 'max-age=0',
    'Content-Type': 'application/x-www-form-urlencoded',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7,zh-CN;q=0.6',
  },
  responseType: 'document',
});

// Response interceptor: successful responses pass through untouched; a timeout is turned into a
// readable message string, every other failure is rejected with the original error.
axiosInstance.interceptors.response.use(
  (response) => response,
  (error) => Promise.reject(
    error.message.includes('timeout')
      ? `目标网络超时,请稍后再试。(${TIME_OUT}ms)`
      : error
  )
);
// Requests must carry the cookie handed out by the site, otherwise some of them are answered with
// HTTP 500 (presumably an anti-crawling measure). Updated after every response that sets cookies.
let curCookie = '';

/**
 * POST to a page and return its HTML wrapped by cheerio.
 * @param {String} url - page path, resolved against the axios base URL
 * @param {Object} [body] - form fields; sent url-encoded, or no payload at all when omitted
 * @return {Function} cheerio root of the returned document (jQuery-like API)
 */
async function loadPage(url, body) {
  const config = curCookie ? { headers: { Cookie: curCookie } } : {};
  const payload = body ? querystring.stringify(body) : null;
  const rst = await axiosInstance.post(url, payload, config);
  // Remember the name=value part of the first Set-Cookie header for the following requests.
  const cookies = rst.headers['set-cookie'];
  if (Array.isArray(cookies)) {
    [curCookie] = cookies[0].split(';');
  }
  return cheerio.load(rst.data);
}
// Month number -> zero-padded month label used in the generated period name.
const monthMap = {
  '1': '01月',
  '2': '02月',
  '3': '03月',
  '4': '04月',
  '5': '05月',
  '6': '06月',
  '7': '07月',
  '8': '08月',
  '9': '09月',
  '10': '10月',
  '11': '11月',
  '12': '12月',
};

/**
 * Resolve every period (month) between `from` and `to`, inclusive, to the uid the site uses for it.
 * The range may span several years.
 * @param {String} from - first period, eg: 2020-01
 * @param {String} to - last period, eg: 2020-05
 * @param {Object} $index - cheerio handle of the initial page (contains the period menu)
 * @return {Array<object> || Null} eg: [{ period: '2020年-05月', uid: 'XCCXXXXX-XX' }];
 *   null when the range is malformed, reversed, or contains a period the page does not offer
 */
function getPeriodData(from, to, $index) {
  const periodReg = /(\d+)-(\d+)/;
  const fromMatch = String(from).match(periodReg);
  const toMatch = String(to).match(periodReg);
  if (!fromMatch || !toMatch) {
    return null;
  }
  const fromMonth = Number(fromMatch[2]);
  const toMonth = Number(toMatch[2]);
  const isValidMonth = (month) => month >= 1 && month <= 12;
  if (!isValidMonth(fromMonth) || !isValidMonth(toMonth)) {
    return null;
  }
  // Work with a running month index so that ranges crossing a year boundary are handled.
  const fromIndex = Number(fromMatch[1]) * 12 + (fromMonth - 1);
  const toIndex = Number(toMatch[1]) * 12 + (toMonth - 1);
  if (fromIndex > toIndex) {
    return null;
  }

  const $period = $index('#PriceLMenu');

  // Look up the uid of one period in the menu; null when the year or month is not listed.
  function getPeriodUID(year, month) {
    const $year = $period
      .find('.MenuOneTitle')
      .filter((_, el) => $index(el).text() === `${year}年`);
    if (!$year.length) {
      return null;
    }
    const $month = $year
      .parent()
      .next()
      .find('a')
      .filter((_, el) => $index(el).text() === `${month}月`);
    if (!$month.length) {
      return null;
    }
    // The uid is the first argument of the Onlink(...) call in the onclick attribute.
    const onclickText = $month.attr('onclick');
    if (!onclickText) {
      return null;
    }
    const matched = String(onclickText).match(/Onlink\('([^']+)'/);
    return matched ? matched[1] : null;
  }

  const list = [];
  for (let monthIndex = fromIndex; monthIndex <= toIndex; monthIndex++) {
    const year = Math.floor(monthIndex / 12);
    const month = (monthIndex % 12) + 1;
    const uid = getPeriodUID(year, month);
    // One invalid period invalidates the whole range.
    if (!uid) {
      return null;
    }
    list.push({
      period: `${year}年-${monthMap[month]}`,
      uid,
    });
  }
  return list;
}
// Table types: they decide which column layout is used when a result table is parsed.
const TableType = {
  // main materials: construction and installation materials
  BUILDING: 1,
  // main materials: landscaping / garden materials
  GARDEN: 2,
  // main materials: green and energy-saving building materials
  ENERGY: 3,
  // area related: per-district materials
  AREA: 4,
  // area related: ready-mixed mortar
  MIXED: 5,
};
/**
 * Parse the result table of a page with the parser that matches the table type.
 * @param {Object} $page - cheerio handle of the page
 * @param {Number} type - table type (one of TableType)
 * @return {Array<object>} parsed rows; empty for an unknown table type
 */
function crawlTableData($page, type) {
  if (type === TableType.BUILDING || type === TableType.ENERGY) {
    return crawlNormalTable($page);
  }
  if (type === TableType.GARDEN) {
    return crawlGardenTable($page);
  }
  if (type === TableType.AREA) {
    return crawlAreaTable($page, '#ID_ucAreaPrice_gridView');
  }
  if (type === TableType.MIXED) {
    return crawlAreaTable($page, '#ID_ucReadyMixedPrice_gridView');
  }
  return [];
}
/**
 * Parse a table whose columns are:
 * No. | material name | specs/model | unit | price incl. tax | price excl. tax | remark
 * @param {Object} $page - cheerio handle of the page
 * @return {Array<object>} one object per complete row: { name, specs, unit, taxPrice, noTaxPrice, remark }
 */
function crawlNormalTable($page) {
  const fields = ['name', 'specs', 'unit', 'taxPrice', 'noTaxPrice', 'remark'];
  // Collect the cell texts, skipping every 7th span (the serial-number column).
  const texts = [];
  $page('#ID_ucPrice_gridView')
    .find('tr td span')
    .filter((position) => position % 7 !== 0)
    .each((_, span) => {
      texts.push($page(span).text());
    });
  // Group the remaining cells row by row; a trailing incomplete row is dropped.
  const data = [];
  for (let start = 0; start + fields.length <= texts.length; start += fields.length) {
    const row = {};
    fields.forEach((field, offset) => {
      row[field] = texts[start + offset];
    });
    data.push(row);
  }
  debugConsole(data);
  return data;
}
/**
 * Parse a table whose columns are:
 * No. | genus | name | height(CM) | trunk diameter(CM) | crown diameter(CM) | branch height(CM) | unit |
 * price incl. tax | price excl. tax | remark
 * @param {Object} $page - cheerio handle of the page
 * @return {Array<object>} one object per complete row
 */
function crawlGardenTable($page) {
  const fields = [
    'genera',
    'name',
    'height',
    'branchDiameter',
    'crownDiameter',
    'branchHeight',
    'unit',
    'taxPrice',
    'noTaxPrice',
    'remark',
  ];
  // Collect the cell texts, skipping every 11th span (the serial-number column).
  const texts = [];
  $page('#ID_ucPrice_gridView')
    .find('tr td span')
    .filter((position) => position % 11 !== 0)
    .each((_, span) => {
      texts.push($page(span).text());
    });
  // Group the remaining cells row by row; a trailing incomplete row is dropped.
  const data = [];
  for (let start = 0; start + fields.length <= texts.length; start += fields.length) {
    const row = {};
    fields.forEach((field, offset) => {
      row[field] = texts[start + offset];
    });
    data.push(row);
  }
  debugConsole(data);
  return data;
}
/**
 * Parse a table whose columns are:
 * No. | district | material name | specs and model | unit | price incl. tax | price excl. tax
 * @param {Object} $page - cheerio handle of the page
 * @param {String} viewSelector - selector (ID) of the table
 * @return {Array<object>} one object per complete row: { area, name, specs, unit, taxPrice, noTaxPrice }
 */
function crawlAreaTable($page, viewSelector) {
  const fields = ['area', 'name', 'specs', 'unit', 'taxPrice', 'noTaxPrice'];
  // Collect the cell texts, skipping every 7th span (the serial-number column).
  const texts = [];
  $page(viewSelector)
    .find('tr td span')
    .filter((position) => position % 7 !== 0)
    .each((_, span) => {
      texts.push($page(span).text());
    });
  // Group the remaining cells row by row; a trailing incomplete row is dropped.
  const data = [];
  for (let start = 0; start + fields.length <= texts.length; start += fields.length) {
    const row = {};
    fields.forEach((field, offset) => {
      row[field] = texts[start + offset];
    });
    data.push(row);
  }
  debugConsole(data);
  return data;
}
// Event targets: the __EVENTTARGET value submitted to request the next page of each listing.
// NOTE(review): MIXED_NEXT is written with "_" separators while the other two use "$" — confirm
// against the live page that this is the value the site expects.
const EventTarget = {
  GENERAL_NEXT: 'ID_ucPrice$UcPager1$btnNext',
  AREA_NEXT: 'ID_ucAreaPrice$UcPager1$btnNext',
  MIXED_NEXT: 'ID_ucReadyMixedPrice_UcPager1_btnNext',
};
/**
 * Crawl a paged listing: load the first page, read the page count from its pager, then request the
 * remaining pages in batches and concatenate all rows.
 * @param {Object} $index - cheerio handle of the page the first request's form state is read from
 * @param {Object} props - form options for the submission
 * @param {String} pageType - page type (one of PageType)
 * @param {Number} tableType - table type (one of TableType)
 * @return {Array<object>} rows of all pages
 */
async function crawlPagesData($index, props, pageType, tableType) {
  // Pick the form builder, pager label and "next page" event target for this page type.
  let buildBody;
  let pageStateSelector;
  let nextTarget;
  if (pageType === PageType.GENERAL) {
    buildBody = getGeneralDataBody;
    pageStateSelector = '#ID_ucPrice_UcPager1_lbPage';
    nextTarget = EventTarget.GENERAL_NEXT;
  } else if (pageType === PageType.AREA) {
    buildBody = getAreaDataBody;
    pageStateSelector = '#ID_ucAreaPrice_UcPager1_lbPage';
    nextTarget = EventTarget.AREA_NEXT;
  } else {
    buildBody = getMixedDataBody;
    pageStateSelector = '#ID_ucReadyMixedPrice_UcPager1_lbPage';
    nextTarget = EventTarget.MIXED_NEXT;
  }

  const $firstPage = await loadPage(pageType, buildBody($index, props));
  const rst = [...crawlTableData($firstPage, tableType)];
  // No rows on the first page: nothing else to fetch.
  if (rst.length === 0) {
    return rst;
  }

  // Submit "next page" from page `page`, using the first page's form state, and parse the result.
  const crawlPageData = async (page) => {
    const body = buildBody($firstPage, { ...props, page, eventTarget: nextTarget });
    const $page = await loadPage(pageType, body);
    return crawlTableData($page, tableType);
  };
  // Wait for the pending requests and append their rows in request order.
  const collect = async (pending) => {
    const pages = await Promise.all(pending);
    pages.forEach((rows) => rst.push(...rows));
  };

  // The pager label reads like "1/10"; the part after the slash is the page count.
  const totalPage = Number($firstPage(pageStateSelector).text().split('/')[1]);
  const asyncCount = 6; // upper bound of requests in flight at once
  let task = [];
  for (let page = 1; page < totalPage; page++) {
    task.push(crawlPageData(page));
    if (task.length === asyncCount) {
      await collect(task);
      task = [];
    }
  }
  if (task.length) {
    await collect(task);
  }
  return rst;
}
/**
 * Crawl one engineering class of the main material prices (construction/installation, garden, or
 * green/energy-saving materials).
 * @param {String} period - period uid
 * @param {String} classID - engineering class id
 * @param {Object} $index - cheerio handle of the initial page
 * @param {Number} type - table type (one of TableType)
 * @return {Array<object>} for BUILDING: [{ materialClass: '黑色及有色金属', items: [...] }];
 *   for the other types: the flat list of rows
 */
async function crawlGeneralSubData(period, classID, $index, type) {
  const $engineeringClassPage = await loadPage(
    PageType.GENERAL,
    getGeneralDataBody($index, { period, classID })
  );

  if (type !== TableType.BUILDING) {
    const items = await crawlPagesData(
      $engineeringClassPage,
      { period, classID, materialClass: '' },
      PageType.GENERAL,
      type
    );
    return [...items];
  }

  // The material class names are the link texts of the category label on the initial page.
  const classList = [];
  $index('#ID_ucPrice_CategoryLabel')
    .find('a')
    .each((_, anchor) => {
      classList.push($engineeringClassPage(anchor).text());
    });
  if (classList.length === 0) {
    throw '无法爬取到材料分类。';
  }

  // Classes are crawled one after another; the leading ordinal ("一、") is stripped from the name.
  const serialReg = /[一二三四五六七八九十]+、/;
  const rst = [];
  for (const materialClass of classList) {
    const items = await crawlPagesData(
      $engineeringClassPage,
      { period, classID, materialClass },
      PageType.GENERAL,
      type
    );
    rst.push({ materialClass: materialClass.replace(serialReg, ''), items });
  }
  return rst;
}
/**
 * Crawl the main material prices (they form the general library).
 * @param {String} period - period uid
 * @param {Object} $index - cheerio handle of the initial page
 * @return {Object} { building, garden, energy }; a key is present only when its engineering class
 *   was found on the page
 */
async function crawlGeneralData(period, $index) {
  const classIDs = crawlClass($index('#ID_ucPrice_tabNewBar'));
  // Garden rows are later classified by their "genus" column; all energy rows belong to the single
  // class "绿色、节能建筑工程材料".
  const plan = [
    ['building', TableType.BUILDING],
    ['garden', TableType.GARDEN],
    ['energy', TableType.ENERGY],
  ];
  const rst = {};
  for (const [key, tableType] of plan) {
    const classID = classIDs[key];
    if (classID) {
      rst[key] = await crawlGeneralSubData(period, classID, $index, tableType);
    }
  }
  return rst;

  // Read the engineering class ids from the tab links; the id is the first argument of the
  // OnClassson(...) call in each link's onclick attribute.
  function crawlClass($class) {
    const keyByLabel = new Map([
      ['建安工程材料', 'building'],
      ['园林绿化工程材料', 'garden'],
      ['绿色、节能建筑工程材料', 'energy'],
    ]);
    const found = { building: undefined, garden: undefined, energy: undefined };
    const reg = /OnClassson\('([^']+)'/;
    $class.find('a').each((_, anchor) => {
      const $anchor = $index(anchor);
      const text = $anchor.text();
      const matched = $anchor.attr('onclick').toString().match(reg);
      if (!matched || !matched[1]) {
        throw '无法爬取到工程分类。';
      }
      if (keyByLabel.has(text)) {
        found[keyByLabel.get(text)] = matched[1];
      }
    });
    return found;
  }
}
/**
 * Crawl the per-district local material prices.
 * @param {String} period - period uid
 * @return {Array<object>} rows of all pages
 */
async function crawlAreaData(period) {
  // The listing's own initial page supplies the form state for the first request.
  const $areaIndex = await loadPage(PageType.AREA);
  const rows = await crawlPagesData($areaIndex, { period }, PageType.AREA, TableType.AREA);
  return rows;
}
/**
 * Crawl the ready-mixed mortar prices.
 * @param {String} period - period uid
 * @return {Array<object>} rows of all pages
 */
async function crawlMixedData(period) {
  // The listing's own initial page supplies the form state for the first request.
  const $mixedIndex = await loadPage(PageType.MIXED);
  const rows = await crawlPagesData($mixedIndex, { period }, PageType.MIXED, TableType.MIXED);
  return rows;
}
/**
 * Convert crawled rows into price items (one source row may have to be split into several items).
 * The source rows are not modified.
 * @param {String} libID - library ID
 * @param {String} classID - ID of the class the items belong to
 * @param {String} period - period name
 * @param {String} areaID - area ID
 * @param {String} compilationID - compilation (cost quota) ID
 * @param {Array<object>} items - crawled source rows
 * @param {Number} tableType - table type (one of TableType)
 * @return {Array<object>} converted price items
 */
function transformPriceItems(libID, classID, period, areaID, compilationID, items, tableType) {
  const rst = [];
  const convert = (item) => transfromPriceItem(libID, classID, period, areaID, compilationID, item);
  // A lone "-" is a "no price" placeholder, not a range separator.
  const cleanPrice = (price) => (price === '-' ? '' : price);
  const splitPrice = (price, separator) => String(price || '').split(separator);

  if (tableType === TableType.GARDEN) {
    // Some rows carry a price range, e.g.
    //   height | trunk dia. | crown dia. | branch height | price = '' | 14-17 | 大于400 | 200-300 | 430-780
    // Such a row becomes two items:
    //   1. { name: <name>-最低价, specs: 干径14-17CM 冠径大于400CM 分枝高200-300CM, noTaxPrice: 430 }
    //   2. { name: <name>-最高价, specs: 干径14-17CM 冠径大于400CM 分枝高200-300CM, noTaxPrice: 780 }
    const unit = 'CM';
    const rangeReg = /-/;
    items.forEach((item) => {
      // Specs are assembled from the dimension columns that are filled in.
      const specs = [
        ['高度', item.height],
        ['干径', item.branchDiameter],
        ['冠径', item.crownDiameter],
        ['分枝高', item.branchHeight],
      ]
        .filter(([, value]) => value)
        .map(([label, value]) => `${label}${value}${unit}`)
        .join(' ');
      const taxPrice = cleanPrice(item.taxPrice);
      const noTaxPrice = cleanPrice(item.noTaxPrice);
      const isRange = rangeReg.test(taxPrice) || rangeReg.test(noTaxPrice);
      if (!isRange) {
        // The assembled specs must be carried along: garden rows have no specs column of their own.
        rst.push(convert({ ...item, specs, taxPrice, noTaxPrice }));
        return;
      }
      const taxPriceList = splitPrice(taxPrice, '-');
      const noTaxPriceList = splitPrice(noTaxPrice, '-');
      rst.push(convert({
        ...item,
        name: `${item.name}-最低价`,
        specs,
        taxPrice: taxPriceList[0],
        noTaxPrice: noTaxPriceList[0],
      }));
      rst.push(convert({
        ...item,
        name: `${item.name}-最高价`,
        specs,
        taxPrice: taxPriceList[1] || '',
        noTaxPrice: noTaxPriceList[1] || '',
      }));
    });
    return rst;
  }

  // Some rows bundle several specs, e.g.
  //   specs | price incl. tax | price excl. tax = φ6(6.5)/φ8 HPB300 | 4030.00/3880.00 | 3566.37/3433.63
  // Such a row becomes one item per spec. A "/" in a price is what marks a row as bundled.
  const bundleReg = /\//;
  // Trailing part shared by all specs: Q390/Q420 δ=20-30 => Q390 δ=20-30; Q420 δ=20-30
  const commonReg = /\s+([^/]*)$/;
  items.forEach((item) => {
    const taxPrice = cleanPrice(item.taxPrice);
    const noTaxPrice = cleanPrice(item.noTaxPrice);
    const isBundle = bundleReg.test(taxPrice) || bundleReg.test(noTaxPrice);
    if (!isBundle) {
      rst.push(convert({ ...item, taxPrice, noTaxPrice }));
      return;
    }
    const rawSpecs = item.specs || '';
    const commonMatched = rawSpecs.match(commonReg);
    const commonSpecs = commonMatched && commonMatched[1] ? ` ${commonMatched[1]}` : '';
    const specsList = rawSpecs.replace(commonReg, '').split('/');
    const taxPriceList = splitPrice(taxPrice, '/');
    const noTaxPriceList = splitPrice(noTaxPrice, '/');
    specsList.forEach((specs, index) => {
      // A spec without a price of its own falls back to the first price.
      rst.push(convert({
        ...item,
        specs: `${specs}${commonSpecs}`,
        taxPrice: taxPriceList[index] || taxPriceList[0],
        noTaxPrice: noTaxPriceList[index] || noTaxPriceList[0],
      }));
    });
  });
  return rst;
}
/**
 * Convert a single source row into a price item.
 * Note: the whitespace-normalized specs are also written back onto the given `item`.
 * @param {String} libID - library ID
 * @param {String} classID - ID of the class the item belongs to
 * @param {String} period - period name
 * @param {String} areaID - area ID
 * @param {String} compilationID - compilation (cost quota) ID
 * @param {Object} item - source row
 * @return {Object} price item with a freshly generated ID
 */
function transfromPriceItem(libID, classID, period, areaID, compilationID, item) {
  // Source specs contain runs of meaningless whitespace; collapse each run to one space.
  const specs = item.specs ? item.specs.replace(/\s{2,}/g, ' ') : '';
  item.specs = specs;
  const { name, unit, taxPrice, noTaxPrice, remark } = item;
  return {
    ID: uuidV1(),
    libID,
    classID,
    code: '',
    name,
    specs,
    unit,
    taxPrice,
    noTaxPrice,
    remark: remark || '',
    // Redundant fields below make the front-end price feature easier to implement.
    period,
    areaID,
    compilationID,
  };
}
/**
 * Convert the main material data into library, class and price records of the "通用" (general) area.
 * @param {String} period - period name
 * @param {String} compilationID - compilation (cost quota) ID
 * @param {Object} generalData - main materials { building, garden, energy }; any key may be missing
 * @return {Object} { libData, classData, priceData, compilationAreas }
 */
async function transfromGeneralData(period, compilationID, generalData) {
  const area = '通用';
  // Areas are matched by name first: an area the compilation already has is reused, not re-created.
  const matchedArea = await priceInfoAreaModel.findOne({ compilationID, name: area }).lean();
  // Areas need a serialNo field; patch records that lack one.
  if (matchedArea && !matchedArea.serialNo) {
    await priceInfoAreaModel.update({ ID: matchedArea.ID }, { $set: { serialNo: 1 } });
  }
  const areaID = (matchedArea && matchedArea.ID) || uuidV1();
  const libData = {
    ID: uuidV1(),
    name: `信息价(${period})`,
    period,
    areas: [],
    compilationID,
    createDate: Date.now(),
  };
  const classData = [];
  const priceData = [];
  const { building, garden, energy } = generalData;

  // Append one class record per source entry (linking it to its previous sibling) plus its prices.
  function handleClassAndItems(sourceData, tableType) {
    if (!sourceData) {
      return;
    }
    sourceData.forEach(({ materialClass, treeData, items }) => {
      const classItem = {
        ID: (treeData && treeData.ID) || uuidV1(),
        ParentID: (treeData && treeData.ParentID) || '-1',
        NextSiblingID: (treeData && treeData.NextSiblingID) || '-1',
        name: materialClass,
        libID: libData.ID,
        areaID,
      };
      // The closest earlier class with the same parent is the previous sibling.
      for (let i = classData.length - 1; i >= 0; i--) {
        if (classData[i].ParentID === classItem.ParentID) {
          classData[i].NextSiblingID = classItem.ID;
          break;
        }
      }
      classData.push(classItem);
      if (items && items.length) {
        const newItems = transformPriceItems(libData.ID, classItem.ID, period, areaID, compilationID, items, tableType);
        newItems.forEach((item) => priceData.push(item));
      }
    });
  }

  handleClassAndItems(building, TableType.BUILDING);

  // Garden classes form a two-level tree: root "苗木" with one child per run of equal genus.
  const gardenRoot = { materialClass: '苗木', treeData: { ID: uuidV1(), ParentID: '-1' } };
  const gardenData = [gardenRoot];
  // `garden` is absent when the site lists no garden class; treat that as "no rows".
  (garden || []).forEach((item) => {
    const last = gardenData[gardenData.length - 1];
    // Never append to the root itself, even if a genus happens to be named like the root.
    if (last !== gardenRoot && last.materialClass === item.genera) {
      last.items.push(item);
    } else {
      gardenData.push({ materialClass: item.genera, treeData: { ParentID: gardenRoot.treeData.ID }, items: [item] });
    }
  });
  handleClassAndItems(gardenData, TableType.GARDEN);

  // All green/energy-saving rows belong to one class.
  const energyData = [{ materialClass: '绿色、节能建筑工程材料', items: energy }];
  handleClassAndItems(energyData, TableType.ENERGY);

  // The area is registered only when there is data for it and it does not exist yet.
  const compilationAreas = [];
  if ((classData.length || priceData.length) && !matchedArea) {
    compilationAreas.push({ compilationID, ID: areaID, name: area });
  }
  return { libData, classData, priceData, compilationAreas };
}
/**
 * Convert the area-related data. Every area becomes a child of the period library and gets up to
 * two classes: local material prices and ready-mixed mortar.
 * @param {String} period - period name
 * @param {String} compilationID - compilation (cost quota) ID
 * @param {Object} libData - library record of the current period
 * @param {Array<object>} areaData - per-district local material rows
 * @param {Array<object>} mixedData - ready-mixed mortar rows
 * @return {Object} { classData, priceData, compilationAreas }
 */
async function transformAreaData(period, compilationID, libData, areaData, mixedData) {
  const areaClass = '地方材料信息价';
  const mixedClass = '预拌商品砂浆';

  // Group rows by area. The index map keeps the areas in the order they were first crawled.
  const data = [];
  const hashMap = {};
  const indexOfArea = (area) => {
    if (!isDef(hashMap[area])) {
      hashMap[area] = Object.keys(hashMap).length;
    }
    return hashMap[area];
  };
  const buildData = (sourceData) => {
    for (const item of sourceData) {
      const idx = indexOfArea(item.area);
      if (!data[idx]) {
        data[idx] = { area: item.area, subData: [] };
      }
      // Slot 0 holds the local material class, slot 1 the ready-mixed mortar class.
      let slot;
      let className;
      if (sourceData === areaData) {
        slot = 0;
        className = areaClass;
      } else if (sourceData === mixedData) {
        slot = 1;
        className = mixedClass;
      } else {
        continue;
      }
      const { subData } = data[idx];
      if (!subData[slot]) {
        subData[slot] = { className, items: [] };
      }
      subData[slot].items.push(item);
    }
  };
  buildData(areaData);
  buildData(mixedData);

  const compilationAreas = [];
  const classData = [];
  const priceData = [];
  for (const { area, subData } of data) {
    const matchedArea = await priceInfoAreaModel.findOne({ compilationID, name: area }).lean();
    // Areas need a serialNo field; patch existing records that lack one.
    const serialNo = areas.indexOf(area) + 1;
    if (matchedArea && !matchedArea.serialNo) {
      await priceInfoAreaModel.update({ ID: matchedArea.ID }, { $set: { serialNo } });
    }
    const areaID = (matchedArea && matchedArea.ID) || uuidV1();
    if (!matchedArea) {
      compilationAreas.push({ compilationID, serialNo, ID: areaID, name: area });
    }
    let preClass;
    for (const subItem of subData) {
      // A slot stays empty when the area has no rows of that kind.
      if (!subItem) {
        continue;
      }
      const classItem = {
        ID: uuidV1(),
        ParentID: '-1',
        NextSiblingID: '-1',
        name: subItem.className,
        libID: libData.ID,
        areaID,
      };
      classData.push(classItem);
      if (preClass) {
        preClass.NextSiblingID = classItem.ID;
      }
      preClass = classItem;
      const newItems = transformPriceItems(libData.ID, classItem.ID, period, areaID, compilationID, subItem.items, TableType.AREA);
      for (const priceItem of newItems) {
        priceData.push(priceItem);
      }
    }
  }
  return { classData, priceData, compilationAreas };
}
/**
 * Store one crawled period: a general library plus the per-area data.
 * Existing records of the same period are removed first.
 * @param {String} period - period name
 * @param {Object} generalData - main materials { building, garden, energy }
 * @param {Array<object>} areaData - per-district material rows
 * @param {Array<object>} mixedData - per-district ready-mixed mortar rows
 * @throws {String} when no compilation is configured with the expected overWriteUrl
 */
async function save(period, generalData, areaData, mixedData) {
  const overWriteUrl = '/web/over_write/js/chongqing_2018.js';
  const compilation = await compilationModel.findOne({ overWriteUrl }, '_id').lean();
  if (!compilation) {
    throw '没有找到正确配置overWriteUrl的费用定额。';
  }
  const compilationID = compilation._id;

  // Convert the crawled data.
  const generalSaveData = await transfromGeneralData(period, compilationID, generalData);
  const { libData } = generalSaveData;
  const areaSaveData = await transformAreaData(period, compilationID, libData, areaData, mixedData);
  const classData = [...generalSaveData.classData, ...areaSaveData.classData];
  const priceData = [...generalSaveData.priceData, ...areaSaveData.priceData];
  const compilationAreas = [...generalSaveData.compilationAreas, ...areaSaveData.compilationAreas];

  // Remove existing data of the same period.
  // NOTE(review): libraries and items are matched by period only, not by compilation — confirm that
  // no other compilation stores data under the same period name.
  const originalLibs = await priceInfoLibModel.find({ period }, '-_id ID').lean();
  const originalLibIDList = originalLibs.map((lib) => lib.ID);
  if (originalLibIDList.length) {
    await priceInfoItemModel.deleteMany({ period });
    await priceInfoClassModel.deleteMany({ libID: { $in: originalLibIDList } });
    await priceInfoLibModel.deleteMany({ period });
  }

  // Insert the new data; empty collections are skipped.
  if (priceData.length) {
    await priceInfoItemModel.insertMany(priceData);
  }
  if (classData.length) {
    await priceInfoClassModel.insertMany(classData);
  }
  if (libData) {
    await priceInfoLibModel.insertMany([libData]);
  }
  if (compilationAreas.length) {
    await priceInfoAreaModel.insertMany(compilationAreas);
  }
}
/**
 * Crawl and store every period between `from` and `to`, one period at a time.
 * @param {String} from - first period, eg: 2020-01
 * @param {String} to - last period, eg: 2020-05
 * @param {String} compilationID - compilation (cost quota) ID whose areas get the serialNo patch
 * @return {Promise<undefined>} resolves when all periods are stored
 * @throws {String} the failure message; when at least one period was stored it also names the
 *   range that succeeded
 */
async function crawlData(from, to, compilationID) {
  let curPeriod;
  try {
    const $index = await loadPage(PageType.GENERAL);
    const periodData = getPeriodData(from, to, $index);
    if (!periodData) {
      throw '无效的期数区间。';
    }
    // Area patch: give existing areas without a serialNo their position in the known area list.
    const areasToPatch = await priceInfoAreaModel.find({ compilationID, serialNo: null }).lean();
    const bulks = areasToPatch.map((areaItem) => ({
      updateOne: {
        filter: { ID: areaItem.ID },
        update: { $set: { serialNo: areas.indexOf(areaItem.name) + 1 } },
      },
    }));
    if (bulks.length) {
      await priceInfoAreaModel.bulkWrite(bulks);
    }
    // Crawl period by period.
    debugConsole('allTime', 'time');
    for (const periodItem of periodData) {
      debugConsole('peroidTime', 'time');
      // Main material prices (the initial page is the main material page).
      const generalData = await crawlGeneralData(periodItem.uid, $index);
      // Per-district local material prices.
      const areaData = await crawlAreaData(periodItem.uid);
      // Ready-mixed mortar prices.
      const mixedData = await crawlMixedData(periodItem.uid);
      // Convert and store.
      await save(periodItem.period, generalData, areaData, mixedData);
      curPeriod = periodItem.period;
      debugConsole('peroidTime', 'timeEnd');
    }
    debugConsole('allTime', 'timeEnd');
  } catch (err) {
    console.log(err);
    // On failure, report which periods were already stored successfully.
    let errTip = '';
    if (curPeriod) {
      errTip += `\n成功爬取期数为:${from}到${curPeriod}`;
    }
    const errStr = String(err) + errTip;
    console.log(errStr);
    // Callers receive the message as a plain string.
    throw errStr;
  }
}
|