chongqing_2018_price_crawler.js 34 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956
  1. /**
  2. * @author vian
  3. * 重庆材料信息价爬虫
  4. * 由于headless chrome “puppeteer”占用资源比较大,且材料信息价的数据是ssr的静态内容,因此不需要使用puppeteer。
  5. * 数据获取使用cheerio(解析html,可用类jquery语法操作生成的数据)
  6. */
  // Public API of this module. `crawlData` is a function declaration further
  // down in this file, so it is already hoisted when this assignment runs.
  module.exports = { crawlData };
  10. const cheerio = require('cheerio');
  11. const axios = require('axios');
  12. const querystring = require('querystring');
  13. const uuidV1 = require('uuid/v1');
  14. const mongoose = require('mongoose');
  15. const { isDef } = require('../../../public/common_util');
  16. const { SSL_OP_SSLEAY_080_CLIENT_DH_BUG } = require('constants');
  17. const compilationModel = mongoose.model('compilation');
  18. const priceInfoLibModel = mongoose.model('std_price_info_lib');
  19. const priceInfoClassModel = mongoose.model('std_price_info_class');
  20. const priceInfoItemModel = mongoose.model('std_price_info_items');
  21. const priceInfoAreaModel = mongoose.model('std_price_info_areas');
  // Master switch for diagnostic console output.
  const isDebug = true;

  /**
   * Forward `str` to the console method named by `type`, but only while
   * `isDebug` is switched on.
   * @param {*} str - value handed to the console method
   * @param {String} [type='log'] - console method name, e.g. 'log', 'time', 'timeEnd'
   */
  function debugConsole(str, type = 'log') {
    if (!isDebug) {
      return;
    }
    console[type](str);
  }
  // Page types. Each value is the request path (resolved against the axios
  // instance's baseURL) of one of the three price listing pages.
  const PageType = {
    GENERAL: '/Index.aspx', // main material prices
    AREA: '/AreaIndex.aspx', // per-district local material prices
    MIXED: '/ReadyMixedIndex.aspx', // ready-mixed mortar prices
  };
  /**
   * Build the form fields posted to the main material price page.
   * Hidden state fields are read from the supplied page; the rest comes from `props`.
   * @param {Object} $ - cheerio-style accessor for the page the form state is read from
   * @param {Object} props - submit options: period, classID, materialClass, page, eventTarget
   * @return {Object} form fields, in the order they are serialized
   */
  function getGeneralDataBody($, props) {
    const materialClass = props.materialClass || '';
    // A missing/falsy page number falls back to the first page.
    const pageNo = (props.page && String(props.page)) || '1';
    const formData = {
      __EVENTTARGET: props.eventTarget || '',
      __EVENTARGUMENT: '',
      __VIEWSTATE: $('#__VIEWSTATE').val(),
      __VIEWSTATEGENERATOR: $('#__VIEWSTATEGENERATOR').val(),
      ID_ucPrice$linkvv: props.period,
      ID_ucPrice$linkcategory: materialClass,
      ID_ucPrice$LinkValue: `${props.classID},${props.period},${materialClass}`,
      ID_ucPrice$txtsonclass: `sonclass${props.classID}`,
      ID_ucPrice$txtfatherclass: $('#ID_ucPrice_txtfatherclass').val(),
      ID_ucPrice$txtClassId: props.classID || '',
      ID_ucPrice$ddlSearchYear: '请选择',
      ID_ucPrice$ddlSearchMonth: '请选择',
      ID_ucPrice$txtSearchCailiao: '',
      ID_ucPrice$UcPager1$listPage: pageNo,
    };
    if (props.eventTarget) {
      return formData;
    }
    // Without an event target the request is submitted through the link button.
    formData.ID_ucPrice$btnLink = $('#ID_ucPrice_btnLink').val();
    return formData;
  }
  /**
   * Build the form fields posted to the per-district local material price page.
   * @param {Object} $ - cheerio-style accessor for the page the form state is read from
   * @param {Object} props - submit options: period, page, eventTarget
   * @return {Object} form fields, or an empty object when `props` is missing or empty
   */
  function getAreaDataBody($, props) {
    const hasProps = Boolean(props) && Object.keys(props).length > 0;
    if (!hasProps) {
      return {};
    }
    // A missing/falsy page number falls back to the first page.
    const pageNo = (props.page && String(props.page)) || '1';
    const formData = {
      __EVENTTARGET: props.eventTarget || '',
      __EVENTARGUMENT: '',
      __VIEWSTATE: $('#__VIEWSTATE').val(),
      __VIEWSTATEGENERATOR: $('#__VIEWSTATEGENERATOR').val(),
      ID_ucAreaPrice$linkvv: props.period,
      ID_ucAreaPrice$LinkValue: '',
      ID_ucAreaPrice$dropArea: 'code',
      ID_ucAreaPrice$txtSearchCailiao: '',
      ID_ucAreaPrice$UcPager1$listPage: pageNo,
    };
    if (props.eventTarget) {
      return formData;
    }
    // Without an event target the request is submitted through the master button.
    formData.ID_ucAreaPrice$btnAreaMaster = 'Button';
    return formData;
  }
  /**
   * Build the form fields posted to the ready-mixed mortar price page.
   * @param {Object} $ - cheerio-style accessor for the page the form state is read from
   * @param {Object} props - submit options: period, page, eventTarget
   * @return {Object} form fields, or an empty object when `props` is missing or empty
   */
  function getMixedDataBody($, props) {
    const hasProps = Boolean(props) && Object.keys(props).length > 0;
    if (!hasProps) {
      return {};
    }
    // A missing/falsy page number falls back to the first page.
    const pageNo = (props.page && String(props.page)) || '1';
    const formData = {
      __EVENTTARGET: props.eventTarget || '',
      __EVENTARGUMENT: '',
      __VIEWSTATE: $('#__VIEWSTATE').val(),
      __VIEWSTATEGENERATOR: $('#__VIEWSTATEGENERATOR').val(),
      ID_ucReadyMixedPrice$linkvv: props.period,
      ID_ucReadyMixedPrice$LinkValue: '',
      ID_ucReadyMixedPrice$dropArea: 'code',
      ID_ucReadyMixedPrice$txtSearchCailiao: '',
      ID_ucReadyMixedPrice$UcPager1$listPage: pageNo,
    };
    if (props.eventTarget) {
      return formData;
    }
    // Without an event target the request is submitted through the master button.
    formData.ID_ucReadyMixedPrice$btnAreaMaster = 'Button';
    return formData;
  }
  // Request timeout in milliseconds, shared by the axios instance and the timeout message.
  const TIME_OUT = 60000;

  // Headers sent with every request made through the shared axios instance.
  const defaultHeaders = {
    'Cache-Control': 'max-age=0',
    'Content-Type': 'application/x-www-form-urlencoded',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7,zh-CN;q=0.6',
  };

  // Shared axios instance for the price site.
  // For traffic capture with Fiddler, a proxy entry can be added here:
  //   proxy: { host: "127.0.0.1", port: "8888" }
  // (Fiddler must be running, otherwise requests fail with a connect error).
  const axiosInstance = axios.create({
    baseURL: 'http://www.cqsgczjxx.org/Jgxx/',
    timeout: TIME_OUT,
    headers: defaultHeaders,
    responseType: 'document'
  });

  // Response interceptor: successful responses pass through untouched; a timeout
  // error is replaced by a readable message, any other error is re-rejected as-is.
  axiosInstance.interceptors.response.use(
    response => response,
    error => {
      const isTimeout = error.message.includes('timeout');
      return Promise.reject(isTimeout ? `目标网络超时,请稍后再试。(${TIME_OUT}ms)` : error);
    }
  );
  // Requests must carry the session cookie, otherwise some of them come back
  // with HTTP 500 (presumably an anti-crawling measure of the site).
  let curCookie = '';

  /**
   * POST to a page and parse the returned HTML with cheerio.
   * Remembers the first cookie of any `set-cookie` response header for later requests.
   * @param {String} url - path appended to the axios baseURL
   * @param {Object} [body] - form fields; sent url-encoded, or no payload when omitted
   * @return {Function} cheerio root for the loaded document (jQuery-like API)
   */
  async function loadPage(url, body) {
    const config = curCookie ? { headers: { Cookie: curCookie } } : {};
    const payload = body ? querystring.stringify(body) : null;
    const rst = await axiosInstance.post(url, payload, config);
    // Refresh the stored cookie: keep only the name=value part of the first entry.
    const cookies = rst.headers['set-cookie'];
    if (Array.isArray(cookies)) {
      const cookiePair = cookies[0].split(';')[0];
      curCookie = cookiePair;
    }
    return cheerio.load(rst.data);
  }
  // Month number -> zero-padded month label used when composing period names.
  const monthMap = {
    1: '01月',
    2: '02月',
    3: '03月',
    4: '04月',
    5: '05月',
    6: '06月',
    7: '07月',
    8: '08月',
    9: '09月',
    10: '10月',
    11: '11月',
    12: '12月',
  };
  /**
   * Resolve every monthly period between `from` and `to` (inclusive) to the
   * uid the site uses for it.
   *
   * Months are walked on a single linear index (year * 12 + month), so ranges
   * that cross a year boundary (e.g. 2019-11 .. 2020-02) are enumerated in full.
   *
   * @param {String} from - first period, eg: 2020-01
   * @param {String} to - last period, eg: 2020-05
   * @param {Object} $index - cheerio root of the initial page (contains #PriceLMenu)
   * @return {Array<object> || Null} eg: [{ period: '2020年-05月', uid: 'XCCXXXXX-XX' }];
   *   null when the range is malformed, reversed, or contains a period the page does not list
   */
  function getPeriodData(from, to, $index) {
    const start = parsePeriod(from);
    const end = parsePeriod(to);
    if (!start || !end) {
      return null;
    }
    // Compare numerically rather than as strings so "2020-9" vs "2020-10" orders correctly.
    const startIndex = start.year * 12 + (start.month - 1);
    const endIndex = end.year * 12 + (end.month - 1);
    if (startIndex > endIndex) {
      return null;
    }
    const $period = $index('#PriceLMenu');
    const list = [];
    for (let monthIndex = startIndex; monthIndex <= endIndex; monthIndex++) {
      const year = Math.floor(monthIndex / 12);
      const month = (monthIndex % 12) + 1;
      const uid = getPeriodUID(year, month);
      // One unknown period invalidates the whole range.
      if (!uid) {
        return null;
      }
      list.push({
        period: `${year}年-${String(month).padStart(2, '0')}月`,
        uid
      });
    }
    return list;

    // Parse "YYYY-MM" into numbers; null when the text is not a usable period.
    function parsePeriod(text) {
      const matched = typeof text === 'string' ? text.match(/(\d+)-(\d+)/) : null;
      if (!matched) {
        return null;
      }
      const year = Number(matched[1]);
      const month = Number(matched[2]);
      if (month < 1 || month > 12) {
        return null;
      }
      return { year, month };
    }

    // Look up the uid of one period in the period menu; null when it is not listed.
    function getPeriodUID(year, month) {
      const $year = $period.find('.MenuOneTitle').filter(function () {
        return $index(this).text() === `${year}年`;
      });
      if (!$year.length) {
        return null;
      }
      // The month links live in the element following the year title's parent.
      const $month = $year.parent().next().find('a').filter(function () {
        return $index(this).text() === `${month}月`;
      });
      if (!$month.length) {
        return null;
      }
      // The uid is embedded in the onclick handler: Onlink('<uid>', ...)
      const onclickText = $month.attr('onclick');
      if (!onclickText) {
        return null;
      }
      const matched = String(onclickText).match(/Onlink\('([^']+)'/);
      return matched && matched[1] ? matched[1] : null;
    }
  }
  // Table layouts the crawler knows how to read.
  const TableType = {
    // main materials: construction & installation materials
    BUILDING: 1,
    // main materials: garden / landscaping
    GARDEN: 2,
    // main materials: energy-saving building materials
    ENERGY: 3,
    // district related: per-district local materials
    AREA: 4,
    // district related: ready-mixed mortar
    MIXED: 5,
  };
  /**
   * Read the price table of a page with the reader that matches the table type.
   * @param {Object} $page - cheerio root of the page
   * @param {Number} type - one of TableType
   * @return {Array<object>} parsed rows; empty for an unknown type
   */
  function crawlTableData($page, type) {
    // BUILDING and ENERGY share the same column layout.
    if (type === TableType.BUILDING || type === TableType.ENERGY) {
      return crawlNormalTable($page);
    }
    if (type === TableType.GARDEN) {
      return crawlGardenTable($page);
    }
    // AREA and MIXED share a layout and differ only in the grid element id.
    if (type === TableType.AREA) {
      return crawlAreaTable($page, '#ID_ucAreaPrice_gridView');
    }
    if (type === TableType.MIXED) {
      return crawlAreaTable($page, '#ID_ucReadyMixedPrice_gridView');
    }
    return [];
  }
  /**
   * Read a table whose columns are:
   * No. | material name | specs/model | unit | price incl. tax | price excl. tax | remark
   * @param {Object} $page - cheerio root of the page
   * @return {Array<object>} one object per complete row
   */
  function crawlNormalTable($page) {
    // Fields of the six cells that follow the serial-number cell of each row.
    const fields = ['name', 'specs', 'unit', 'taxPrice', 'noTaxPrice', 'remark'];
    const cellsPerRow = fields.length + 1;
    const rows = [];
    let row;
    $page('#ID_ucPrice_gridView').find('tr td span').each(function (position) {
      const cell = position % cellsPerRow;
      if (cell === 0) {
        return; // serial-number cell carries no data
      }
      if (cell === 1) {
        row = {};
      }
      row[fields[cell - 1]] = $page(this).text();
      // The row is only stored once its last cell has been read.
      if (cell === cellsPerRow - 1) {
        rows.push(row);
      }
    });
    debugConsole(rows);
    return rows;
  }
  /**
   * Read a table whose columns are:
   * No. | genera | name | height(CM) | trunk diameter(CM) | crown diameter(CM) |
   * branch height(CM) | unit | price incl. tax | price excl. tax | remark
   * @param {Object} $page - cheerio root of the page
   * @return {Array<object>} one object per complete row
   */
  function crawlGardenTable($page) {
    // Fields of the ten cells that follow the serial-number cell of each row.
    const fields = [
      'genera',
      'name',
      'height',
      'branchDiameter',
      'crownDiameter',
      'branchHeight',
      'unit',
      'taxPrice',
      'noTaxPrice',
      'remark',
    ];
    const cellsPerRow = fields.length + 1;
    const rows = [];
    let row;
    $page('#ID_ucPrice_gridView').find('tr td span').each(function (position) {
      const cell = position % cellsPerRow;
      if (cell === 0) {
        return; // serial-number cell carries no data
      }
      if (cell === 1) {
        row = {};
      }
      row[fields[cell - 1]] = $page(this).text();
      // The row is only stored once its last cell has been read.
      if (cell === cellsPerRow - 1) {
        rows.push(row);
      }
    });
    debugConsole(rows);
    return rows;
  }
  /**
   * Read a table whose columns are:
   * No. | district | material name | specs/model | unit | price incl. tax | price excl. tax
   * @param {Object} $page - cheerio root of the page
   * @param {String} viewSelector - selector (element id) of the grid to read
   * @return {Array<object>} one object per complete row
   */
  function crawlAreaTable($page, viewSelector) {
    // Fields of the six cells that follow the serial-number cell of each row.
    const fields = ['area', 'name', 'specs', 'unit', 'taxPrice', 'noTaxPrice'];
    const cellsPerRow = fields.length + 1;
    const rows = [];
    let row;
    $page(viewSelector).find('tr td span').each(function (position) {
      const cell = position % cellsPerRow;
      if (cell === 0) {
        return; // serial-number cell carries no data
      }
      if (cell === 1) {
        row = {};
      }
      row[fields[cell - 1]] = $page(this).text();
      // The row is only stored once its last cell has been read.
      if (cell === cellsPerRow - 1) {
        rows.push(row);
      }
    });
    debugConsole(rows);
    return rows;
  }
  // Values posted as __EVENTTARGET to request the next page of each listing.
  // NOTE(review): MIXED_NEXT is written with "_" separators while the other two
  // use "$" - confirm against the live page that this difference is intended.
  const EventTarget = {
    GENERAL_NEXT: 'ID_ucPrice$UcPager1$btnNext',
    AREA_NEXT: 'ID_ucAreaPrice$UcPager1$btnNext',
    MIXED_NEXT: 'ID_ucReadyMixedPrice_UcPager1_btnNext',
  };
  /**
   * Crawl every page of one paged price table.
   * The first page is requested with the form state of `$index`; the remaining
   * pages are requested with the form state of that first page, in batches.
   * @param {Object} $index - cheerio root of the page whose form state seeds the first request
   * @param {Object} props - submit options passed to the form body builder
   * @param {String} pageType - one of PageType (any other value is treated as MIXED)
   * @param {Number} tableType - one of TableType
   * @return {Array<object>} rows of all pages, first page first
   */
  async function crawlPagesData($index, props, pageType, tableType) {
    // Pick the body builder, pager label and "next page" event for this page type.
    let buildBody = getMixedDataBody;
    let pageStateSelector = '#ID_ucReadyMixedPrice_UcPager1_lbPage';
    let nextTarget = EventTarget.MIXED_NEXT;
    if (pageType === PageType.GENERAL) {
      buildBody = getGeneralDataBody;
      pageStateSelector = '#ID_ucPrice_UcPager1_lbPage';
      nextTarget = EventTarget.GENERAL_NEXT;
    } else if (pageType === PageType.AREA) {
      buildBody = getAreaDataBody;
      pageStateSelector = '#ID_ucAreaPrice_UcPager1_lbPage';
      nextTarget = EventTarget.AREA_NEXT;
    }

    const $firstPage = await loadPage(pageType, buildBody($index, props));
    const rst = [...crawlTableData($firstPage, tableType)];
    // Nothing on the first page means there is nothing to page through.
    if (rst.length === 0) {
      return rst;
    }

    // The pager label reads "<current>/<total>", eg: 1/10
    const totalPage = +$firstPage(pageStateSelector).text().split('/')[1];
    const asyncCount = 6; // maximum number of requests in flight per batch

    // Request the page that follows `page` by firing the "next" event from it.
    const crawlPageData = async (page) => {
      const pageBody = buildBody($firstPage, { ...props, page, eventTarget: nextTarget });
      const $page = await loadPage(pageType, pageBody);
      return crawlTableData($page, tableType);
    };

    for (let batchStart = 1; batchStart < totalPage; batchStart += asyncCount) {
      const batch = [];
      const batchEnd = batchStart + asyncCount;
      for (let page = batchStart; page < totalPage && page < batchEnd; page++) {
        batch.push(crawlPageData(page));
      }
      const batchRows = await Promise.all(batch);
      for (const rows of batchRows) {
        rst.push(...rows);
      }
    }
    return rst;
  }
  /**
   * Crawl one engineering class of the main material page: construction &
   * installation materials, garden materials, or energy-saving materials.
   * @param {String} period - period uid
   * @param {String} classID - engineering class id
   * @param {Object} $index - cheerio root of the initial page
   * @param {Number} type - one of TableType
   * @return {Array<object>} for BUILDING: [{ materialClass: '黑色及有色金属', items: [...] }];
   *   for the other types: the flat list of rows
   */
  async function crawlGeneralSubData(period, classID, $index, type) {
    const classBody = getGeneralDataBody($index, { period, classID });
    const $engineeringClassPage = await loadPage(PageType.GENERAL, classBody);

    if (type !== TableType.BUILDING) {
      const items = await crawlPagesData($engineeringClassPage, { period, classID, materialClass: '' }, PageType.GENERAL, type);
      return [...items];
    }

    // BUILDING is crawled one material class at a time.
    const classList = [];
    $index('#ID_ucPrice_CategoryLabel').find('a').each(function () {
      classList.push($engineeringClassPage(this).text());
    });
    if (!classList.length) {
      throw '无法爬取到材料分类。';
    }
    const ordinalReg = /[一二三四五六七八九十]+、/;
    const rst = [];
    for (const materialClass of classList) {
      const items = await crawlPagesData($engineeringClassPage, { period, classID, materialClass }, PageType.GENERAL, type);
      // The stored class name drops the leading Chinese ordinal ("一、" ...).
      rst.push({ materialClass: materialClass.replace(ordinalReg, ''), items });
    }
    return rst;
  }
  /**
   * Crawl the main material prices (this part serves as the general library).
   * Only engineering classes that appear in the page's class tab bar are crawled.
   * @param {String} period - period uid
   * @param {Object} $index - cheerio root of the initial page
   * @return {Object} { building?, garden?, energy? }
   */
  async function crawlGeneralData(period, $index) {
    const classIDs = collectClassIDs();
    const rst = {};
    if (classIDs.building) {
      rst.building = await crawlGeneralSubData(period, classIDs.building, $index, TableType.BUILDING);
    }
    if (classIDs.garden) {
      // Garden rows are later classified by their "genera" column.
      rst.garden = await crawlGeneralSubData(period, classIDs.garden, $index, TableType.GARDEN);
    }
    if (classIDs.energy) {
      // All energy rows belong to the single class "绿色、节能建筑工程材料".
      rst.energy = await crawlGeneralSubData(period, classIDs.energy, $index, TableType.ENERGY);
    }
    return rst;

    // Read the engineering class ids out of the onclick handlers of the tab bar links.
    function collectClassIDs() {
      const keyByLabel = new Map([
        ['建安工程材料', 'building'],
        ['园林绿化工程材料', 'garden'],
        ['绿色、节能建筑工程材料', 'energy'],
      ]);
      const idReg = /OnClassson\('([^']+)'/;
      const found = {};
      $index('#ID_ucPrice_tabNewBar').find('a').each(function () {
        const $link = $index(this);
        const text = $link.text();
        const matched = $link.attr('onclick').toString().match(idReg);
        // Every link must carry an id, even one whose label is not recognised.
        if (!matched || !matched[1]) {
          throw '无法爬取到工程分类。';
        }
        const key = keyByLabel.get(text);
        if (key) {
          found[key] = matched[1];
        }
      });
      return found;
    }
  }
  /**
   * Crawl the per-district local material prices of one period.
   * @param {String} period - period uid
   * @return {Array<object>} rows of all pages
   */
  async function crawlAreaData(period) {
    // Load the district page first: its form state seeds the paged requests.
    const $areaIndex = await loadPage(PageType.AREA);
    const rows = await crawlPagesData($areaIndex, { period }, PageType.AREA, TableType.AREA);
    return rows;
  }
  /**
   * Crawl the ready-mixed mortar prices of one period.
   * @param {String} period - period uid
   * @return {Array<object>} rows of all pages
   */
  async function crawlMixedData(period) {
    // Load the mortar page first: its form state seeds the paged requests.
    const $mixedIndex = await loadPage(PageType.MIXED);
    const rows = await crawlPagesData($mixedIndex, { period }, PageType.MIXED, TableType.MIXED);
    return rows;
  }
  /**
   * Convert crawled rows into price records. One source row may expand into
   * several records when its price cell holds more than one value.
   *
   * Garden rows get their `specs` assembled from the four dimension columns
   * (for every row, not only for rows that are split), and a price range
   * "min-max" becomes a "-最低价" and a "-最高价" record.
   * Other rows are split on "/" in the price cells, pairing each price with
   * the matching "/"-separated part of the specs.
   *
   * The input rows are not modified.
   *
   * @param {String} libID - library ID
   * @param {String} classID - ID of the owning class
   * @param {String} period - period label
   * @param {String} areaID - area ID
   * @param {String} compilationID - compilation ID
   * @param {Array<object>} items - crawled source rows
   * @param {Number} tableType - one of TableType
   * @return {Array<object>} price records
   */
  function transformPriceItems(libID, classID, period, areaID, compilationID, items, tableType) {
    const split = tableType === TableType.GARDEN ? splitGardenItem : splitSlashItem;
    const rst = [];
    items.forEach(item => {
      split(item).forEach(part => {
        rst.push(transfromPriceItem(libID, classID, period, areaID, compilationID, part));
      });
    });
    return rst;

    // Garden row -> one record, or a lowest/highest pair when a price is a range.
    // eg: trunk 14-17 | crown 大于400 | branch height 200-300 | price 430-780 becomes
    //   { name: 名称-最低价, specs: 干径14-17CM 冠径大于400CM 分枝高200-300CM, noTaxPrice: 430 }
    //   { name: 名称-最高价, specs: 干径14-17CM 冠径大于400CM 分枝高200-300CM, noTaxPrice: 780 }
    function splitGardenItem(item) {
      const unit = 'CM';
      const dimensions = [
        ['高度', item.height],
        ['干径', item.branchDiameter],
        ['冠径', item.crownDiameter],
        ['分枝高', item.branchHeight],
      ];
      const specs = dimensions
        .filter(([, value]) => value)
        .map(([label, value]) => `${label}${value}${unit}`)
        .join(' ');
      const withSpecs = { ...item, specs };
      const rangeReg = /-/;
      if (!rangeReg.test(item.taxPrice) && !rangeReg.test(item.noTaxPrice)) {
        return [withSpecs];
      }
      const taxPriceList = String(item.taxPrice || '').split('-');
      const noTaxPriceList = String(item.noTaxPrice || '').split('-');
      return [
        {
          ...withSpecs,
          name: `${item.name}-最低价`,
          taxPrice: taxPriceList[0],
          noTaxPrice: noTaxPriceList[0]
        },
        {
          ...withSpecs,
          name: `${item.name}-最高价`,
          taxPrice: taxPriceList[1] || '',
          noTaxPrice: noTaxPriceList[1] || ''
        },
      ];
    }

    // Regular row -> one record per "/"-separated price.
    // eg: φ6(6.5)/φ8 HPB300 | 4030.00/3880.00 | 3566.37/3433.63 becomes two records.
    function splitSlashItem(item) {
      // A lone "-" in a price cell means "no price".
      const normalized = {
        ...item,
        taxPrice: item.taxPrice === '-' ? '' : item.taxPrice,
        noTaxPrice: item.noTaxPrice === '-' ? '' : item.noTaxPrice
      };
      const slashReg = /\//;
      // A split price is what marks the row as needing a split.
      if (!slashReg.test(normalized.taxPrice) && !slashReg.test(normalized.noTaxPrice)) {
        return [normalized];
      }
      // Separate the shared tail of the specs from the per-record parts:
      // Q390/Q420 δ=20-30 => Q390 δ=20-30; Q420 δ=20-30
      const commonReg = /\s+([^/]*)$/;
      const sourceSpecs = normalized.specs || '';
      const commonMatched = sourceSpecs.match(commonReg);
      const commonSpecs = commonMatched && commonMatched[1] ? ` ${commonMatched[1]}` : '';
      const specsList = sourceSpecs.replace(commonReg, '').split('/');
      const taxPriceList = String(normalized.taxPrice || '').split('/');
      const noTaxPriceList = String(normalized.noTaxPrice || '').split('/');
      // A part without its own price falls back to the first price.
      return specsList.map((specs, index) => ({
        ...normalized,
        specs: `${specs}${commonSpecs}`,
        taxPrice: taxPriceList[index] || taxPriceList[0],
        noTaxPrice: noTaxPriceList[index] || noTaxPriceList[0]
      }));
    }
  }
  /**
   * Convert a single source row into a price record with a fresh ID.
   * Note: the normalised specs are also written back onto `item`.
   * @param {String} libID - library ID
   * @param {String} classID - ID of the owning class
   * @param {String} period - period label
   * @param {String} areaID - area ID
   * @param {String} compilationID - compilation ID
   * @param {Object} item - source row
   * @return {Object} price record
   */
  function transfromPriceItem(libID, classID, period, areaID, compilationID, item) {
    // Source specs contain runs of meaningless whitespace; squeeze each run into one space.
    const squeezed = item.specs ? item.specs.replace(/\s{2,}/g, ' ') : '';
    item.specs = squeezed;
    const { name, unit, taxPrice, noTaxPrice } = item;
    const record = {
      ID: uuidV1(),
      libID,
      classID,
      code: '',
      name,
      specs: squeezed,
      unit,
      taxPrice,
      noTaxPrice,
      remark: item.remark || '',
      // Redundant fields below are kept for the front-end price-info features.
      period,
      areaID,
      compilationID,
    };
    return record;
  }
  /**
   * Convert the crawled main materials into library, class and price records
   * under the shared "通用" area.
   *
   * Engineering classes that were not crawled are tolerated: a missing
   * `building` or `garden` is skipped. The energy class record is always
   * created, with no prices when `energy` is missing.
   *
   * @param {String} period - period label
   * @param {String} compilationID - compilation ID
   * @param {Object} generalData - main materials { building, garden, energy }
   * @return {Object} { libData, classData, priceData, compilationAreas }
   */
  async function transfromGeneralData(period, compilationID, generalData) {
    const area = '通用';
    // Areas are matched by name first: an area the compilation already has is reused.
    const matchedArea = await priceInfoAreaModel.findOne({ compilationID, name: area }).lean();
    const areaID = (matchedArea && matchedArea.ID) || uuidV1();
    const libData = {
      ID: uuidV1(),
      name: `信息价(${period})`,
      period,
      areas: [],
      compilationID,
      createDate: Date.now(),
    };
    const classData = [];
    const priceData = [];
    // Most recently added class per ParentID, used to chain NextSiblingID.
    const lastClassByParent = new Map();
    const { building, garden, energy } = generalData;

    handleClassAndItems(building, TableType.BUILDING);

    // Garden classes form a tree: root "苗木" -> one child per genera.
    // Consecutive rows with the same genera share a child class.
    if (Array.isArray(garden)) {
      const gardenRoot = { materialClass: '苗木', treeData: { ID: uuidV1(), ParentID: '-1' }, items: [] };
      const gardenData = [gardenRoot];
      garden.forEach(item => {
        const pre = gardenData[gardenData.length - 1];
        if (item.genera !== pre.materialClass) {
          gardenData.push({ materialClass: item.genera, treeData: { ParentID: gardenRoot.treeData.ID }, items: [item] });
        } else {
          pre.items.push(item);
        }
      });
      handleClassAndItems(gardenData, TableType.GARDEN);
    }

    // All energy rows belong to one class.
    const energyData = [{ materialClass: '绿色、节能建筑工程材料', items: energy }];
    handleClassAndItems(energyData, TableType.ENERGY);

    // The area is only registered when there is data and it does not exist yet.
    const compilationAreas = [];
    if ((classData.length || priceData.length) && !matchedArea) {
      compilationAreas.push({ compilationID, ID: areaID, name: area });
    }
    return { libData, classData, priceData, compilationAreas };

    // Append one class record per source entry and convert its rows into prices.
    function handleClassAndItems(sourceData, tableType) {
      if (!sourceData) {
        return;
      }
      sourceData.forEach(({ materialClass, treeData, items }) => {
        const classItem = {
          ID: (treeData && treeData.ID) || uuidV1(),
          ParentID: (treeData && treeData.ParentID) || '-1',
          NextSiblingID: (treeData && treeData.NextSiblingID) || '-1',
          name: materialClass,
          libID: libData.ID,
          areaID,
        };
        // Link the previous class under the same parent to this one.
        const previousSibling = lastClassByParent.get(classItem.ParentID);
        if (previousSibling) {
          previousSibling.NextSiblingID = classItem.ID;
        }
        lastClassByParent.set(classItem.ParentID, classItem);
        classData.push(classItem);
        if (items && items.length) {
          const newItems = transformPriceItems(libData.ID, classItem.ID, period, areaID, compilationID, items, tableType);
          newItems.forEach(item => priceData.push(item));
        }
      });
    }
  }
  /**
   * Convert the district-related rows into class and price records.
   * Rows are grouped by district; each district becomes an area under the
   * current period library and gets up to two classes: local materials and
   * ready-mixed mortar.
   * @param {String} period - period label
   * @param {String} compilationID - compilation ID
   * @param {Object} libData - library record of the current period
   * @param {Array<object>} areaData - per-district local material rows
   * @param {Array<object>} mixedData - ready-mixed mortar rows
   * @return {Object} { classData, priceData, compilationAreas }
   */
  async function transformAreaData(period, compilationID, libData, areaData, mixedData) {
    const areaClass = '地方材料信息价';
    const mixedClass = '预拌商品砂浆';
    // Districts keep the order in which the crawled rows first mention them:
    // the lookup maps a district name to its slot in `grouped`.
    const grouped = [];
    const slotByArea = {};
    const slotOf = area => {
      if (!isDef(slotByArea[area])) {
        slotByArea[area] = Object.keys(slotByArea).length;
      }
      return slotByArea[area];
    };
    // File every row under its district: slot 0 holds local materials, slot 1 mortar.
    const collect = sourceData => {
      const isArea = sourceData === areaData;
      const isMixed = !isArea && sourceData === mixedData;
      sourceData.forEach(item => {
        const idx = slotOf(item.area);
        if (!grouped[idx]) {
          grouped[idx] = { area: item.area, subData: [] };
        }
        const { subData } = grouped[idx];
        if (isArea) {
          if (!subData[0]) {
            subData[0] = { className: areaClass, items: [] };
          }
          subData[0].items.push(item);
        } else if (isMixed) {
          if (!subData[1]) {
            subData[1] = { className: mixedClass, items: [] };
          }
          subData[1].items.push(item);
        }
      });
    };
    collect(areaData);
    collect(mixedData);

    const compilationAreas = [];
    const classData = [];
    const priceData = [];
    for (const { area, subData } of grouped) {
      // Reuse an area the compilation already has; otherwise register a new one.
      const matchedArea = await priceInfoAreaModel.findOne({ compilationID, name: area }).lean();
      const areaID = matchedArea && matchedArea.ID || uuidV1();
      if (!matchedArea) {
        compilationAreas.push({ compilationID, ID: areaID, name: area });
      }
      let preClass;
      for (const subItem of subData) {
        // A district may lack one of the two classes.
        if (!subItem) {
          continue;
        }
        const classItem = {
          ID: uuidV1(),
          ParentID: '-1',
          NextSiblingID: '-1',
          name: subItem.className,
          libID: libData.ID,
          areaID,
        };
        classData.push(classItem);
        // Chain the classes of one district together.
        if (preClass) {
          preClass.NextSiblingID = classItem.ID;
        }
        preClass = classItem;
        const newItems = transformPriceItems(libData.ID, classItem.ID, period, areaID, compilationID, subItem.items, TableType.AREA);
        for (const newItem of newItems) {
          priceData.push(newItem);
        }
      }
    }
    return { classData, priceData, compilationAreas };
  }
  /**
   * Persist one crawled period: a general library plus the per-district data.
   *
   * Data previously stored for the same period is replaced. The replacement is
   * limited to libraries of this crawler's compilation, and to the class and
   * price records that belong to those libraries, so data that other
   * compilations hold for the same period is left alone.
   *
   * @param {String} period - period label
   * @param {Object} generalData - main materials { building, garden, energy }
   * @param {Array<object>} areaData - per-district local material rows
   * @param {Array<object>} mixedData - per-district ready-mixed mortar rows
   * @throws {String} when no compilation is configured with the expected overWriteUrl
   */
  async function save(period, generalData, areaData, mixedData) {
    const overWriteUrl = '/web/over_write/js/chongqing_2018.js';
    const compilation = await compilationModel.findOne({ overWriteUrl }, '_id').lean();
    if (!compilation) {
      throw '没有找到正确配置overWriteUrl的费用定额。';
    }
    const compilationID = compilation._id;
    // Convert the crawled data into records.
    const generalSaveData = await transfromGeneralData(period, compilationID, generalData);
    const libData = generalSaveData.libData;
    const areaSaveData = await transformAreaData(period, compilationID, libData, areaData, mixedData);
    const classData = [...generalSaveData.classData, ...areaSaveData.classData];
    const priceData = [...generalSaveData.priceData, ...areaSaveData.priceData];
    const compilationAreas = [...generalSaveData.compilationAreas, ...areaSaveData.compilationAreas];
    // Remove what this compilation already stores for the period.
    const originalLibs = await priceInfoLibModel.find({ period, compilationID }, '-_id ID').lean();
    const originalLibIDList = originalLibs.map(lib => lib.ID);
    if (originalLibIDList.length) {
      const ownedByOriginalLibs = { libID: { $in: originalLibIDList } };
      await priceInfoItemModel.deleteMany(ownedByOriginalLibs);
      await priceInfoClassModel.deleteMany(ownedByOriginalLibs);
      await priceInfoLibModel.deleteMany({ ID: { $in: originalLibIDList } });
    }
    // Insert the new records.
    if (priceData.length) {
      await priceInfoItemModel.insertMany(priceData);
    }
    if (classData.length) {
      await priceInfoClassModel.insertMany(classData);
    }
    if (libData) {
      await priceInfoLibModel.insertMany([libData]);
    }
    // Only areas the compilation does not have yet are listed here.
    if (compilationAreas.length) {
      await priceInfoAreaModel.insertMany(compilationAreas);
    }
  }
  /**
   * Crawl and store every period between `from` and `to`, one period at a time.
   * On failure the error is logged and re-thrown as a string, extended with the
   * last period that was stored successfully (if any).
   * @param {String} from - first period, eg: 2020-01
   * @param {String} to - last period, eg: 2020-05
   * @throws {String} description of the failure
   */
  async function crawlData(from, to) {
    // Label of the most recent period that made it into the database.
    let curPeriod;
    try {
      const $index = await loadPage(PageType.GENERAL);
      const periodData = getPeriodData(from, to, $index);
      if (!periodData) {
        throw '无效的期数区间。';
      }
      debugConsole('allTime', 'time');
      for (const { uid, period } of periodData) {
        debugConsole('peroidTime', 'time');
        // Main materials: the initial page already is the main material page.
        const generalData = await crawlGeneralData(uid, $index);
        // Per-district local materials.
        const areaData = await crawlAreaData(uid);
        // Ready-mixed mortar.
        const mixedData = await crawlMixedData(uid);
        // Convert and store.
        await save(period, generalData, areaData, mixedData);
        curPeriod = period;
        debugConsole('peroidTime', 'timeEnd');
      }
      debugConsole('allTime', 'timeEnd');
    } catch (err) {
      console.log(err);
      // Tell the caller how far the crawl got before it failed.
      const errTip = curPeriod ? `\n成功爬取期数为:${from}到${curPeriod}` : '';
      const errStr = String(err) + errTip;
      console.log(`err`);
      console.log(errStr);
      throw errStr;
    }
  }