chongqing_2018_price_crawler.js 36 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044
  1. /**
  2. * @author vian
  3. * 重庆材料信息价爬虫
  4. * 由于headless chrome “puppeteer”占用资源比较大,且材料信息价的数据是ssr的静态内容,因此不需要使用puppeteer。
  5. * 数据获取使用cheerio(解析html,可用类jquery语法操作生成的数据)
  6. */
  // Public surface of this module. crawlData is not defined in the visible part of the file;
  // presumably it is a hoisted function declaration further down — TODO confirm.
  module.exports = { crawlData: crawlData };
  10. const cheerio = require('cheerio');
  11. const axios = require('axios');
  12. const querystring = require('querystring');
  13. const uuidV1 = require('uuid/v1');
  14. const mongoose = require('mongoose');
  15. const { isDef } = require('../../../public/common_util');
  16. const compilationModel = mongoose.model('compilation');
  17. const priceInfoLibModel = mongoose.model('std_price_info_lib');
  18. const priceInfoClassModel = mongoose.model('std_price_info_class');
  19. const priceInfoItemModel = mongoose.model('std_price_info_items');
  20. const priceInfoAreaModel = mongoose.model('std_price_info_areas');
  // Master switch for the diagnostic output produced through debugConsole.
  const isDebug = true;

  /**
   * Forward a value to a console method, but only while debugging is switched on.
   * @param {*} str - value handed to the console method
   * @param {String} [type='log'] - name of the console method to call
   */
  function debugConsole(str, type = 'log') {
    if (!isDebug) {
      return;
    }
    console[type](str);
  }
  // Known area names. transformAreaData uses the 1-based position of a name in this list
  // as the area's serialNo, so the order of the entries matters.
  const areas = [
    '主城区', '渝中区', '江北区', '沙坪坝区', '南岸区',
    '九龙坡区', '大渡口区', '北碚区', '渝北区', '巴南区',
    '万州区', '涪陵区', '万盛区', '双桥区', '黔江区',
    '长寿区', '江津区', '合川区', '永川区', '南川区',
    '綦江县', '潼南县', '铜梁县', '大足县', '荣昌县', '璧山县',
    '梁平区', '城口县', '丰都县', '垫江县', '忠县',
    '开州区', '云阳县', '奉节县', '巫山县', '巫溪县',
    '石柱县', '秀山县', '酉阳县', '彭水县',
    '大足区', '綦江区', '万盛经开区', '双桥经开区', '铜梁区', '璧山区',
    '荣昌县1', '荣昌县2',
    '彭水县1', '彭水县2', '彭水县3',
    '潼南区',
    '荣昌区1', '荣昌区2',
    '武隆区', '武隆区1', '武隆区2', '武隆区3', '武隆区4', '武隆区5', '武隆区6',
  ];
  // Page types. Each value is the path handed to loadPage as the request URL.
  const PageType = {
    // main material price page
    GENERAL: '/Index.aspx',
    // per-district local material price page
    AREA: '/AreaIndex.aspx',
    // ready-mixed mortar price page
    MIXED: '/ReadyMixedIndex.aspx',
  };
  /**
   * Build the form fields posted to the main material price page.
   * Hidden state fields are copied from the page given as `$`.
   * @param {Object} $ - cheerio-style accessor of the page whose hidden fields are echoed back
   * @param {Object} props - submit options: period, classID, materialClass, page, eventTarget
   * @return {Object} form fields, in the order they will be serialized
   */
  function getGeneralDataBody($, props) {
    const { eventTarget, period, classID, materialClass, page } = props;
    const readValue = (selector) => $(selector).val();
    const category = materialClass || '';
    const body = {
      __EVENTTARGET: eventTarget || '',
      __EVENTARGUMENT: '',
      __VIEWSTATE: readValue('#__VIEWSTATE'),
      __VIEWSTATEGENERATOR: readValue('#__VIEWSTATEGENERATOR'),
      ID_ucPrice$linkvv: period,
      ID_ucPrice$linkcategory: category,
      ID_ucPrice$LinkValue: `${classID},${period},${category}`,
      ID_ucPrice$txtsonclass: `sonclass${classID}`,
      ID_ucPrice$txtfatherclass: readValue('#ID_ucPrice_txtfatherclass'),
      ID_ucPrice$txtClassId: classID || '',
      ID_ucPrice$ddlSearchYear: '请选择',
      ID_ucPrice$ddlSearchMonth: '请选择',
      ID_ucPrice$txtSearchCailiao: '',
      ID_ucPrice$UcPager1$listPage: (page && String(page)) || '1',
    };
    if (eventTarget) {
      return body;
    }
    // Without an event target the link button's own value is submitted as well.
    body.ID_ucPrice$btnLink = readValue('#ID_ucPrice_btnLink');
    return body;
  }
  /**
   * Build the form fields posted to the per-district local material price page.
   * @param {Object} $ - cheerio-style accessor of the page whose hidden fields are echoed back
   * @param {Object} props - submit options: period, page, eventTarget
   * @return {Object} form fields, or an empty object when props is missing or has no keys
   */
  function getAreaDataBody($, props) {
    const hasProps = Boolean(props) && Object.keys(props).length > 0;
    if (!hasProps) {
      return {};
    }
    const { eventTarget, period, page } = props;
    const body = {
      __EVENTTARGET: eventTarget || '',
      __EVENTARGUMENT: '',
      __VIEWSTATE: $('#__VIEWSTATE').val(),
      __VIEWSTATEGENERATOR: $('#__VIEWSTATEGENERATOR').val(),
      ID_ucAreaPrice$linkvv: period,
      ID_ucAreaPrice$LinkValue: '',
      ID_ucAreaPrice$dropArea: 'code',
      ID_ucAreaPrice$txtSearchCailiao: '',
      ID_ucAreaPrice$UcPager1$listPage: (page && String(page)) || '1',
    };
    if (eventTarget) {
      return body;
    }
    // Without an event target the submit button field is sent instead.
    body.ID_ucAreaPrice$btnAreaMaster = 'Button';
    return body;
  }
  /**
   * Build the form fields posted to the ready-mixed mortar price page.
   * @param {Object} $ - cheerio-style accessor of the page whose hidden fields are echoed back
   * @param {Object} props - submit options: period, page, eventTarget
   * @return {Object} form fields, or an empty object when props is missing or has no keys
   */
  function getMixedDataBody($, props) {
    const hasProps = Boolean(props) && Object.keys(props).length > 0;
    if (!hasProps) {
      return {};
    }
    const { eventTarget, period, page } = props;
    const body = {
      __EVENTTARGET: eventTarget || '',
      __EVENTARGUMENT: '',
      __VIEWSTATE: $('#__VIEWSTATE').val(),
      __VIEWSTATEGENERATOR: $('#__VIEWSTATEGENERATOR').val(),
      ID_ucReadyMixedPrice$linkvv: period,
      ID_ucReadyMixedPrice$LinkValue: '',
      ID_ucReadyMixedPrice$dropArea: 'code',
      ID_ucReadyMixedPrice$txtSearchCailiao: '',
      ID_ucReadyMixedPrice$UcPager1$listPage: (page && String(page)) || '1',
    };
    if (eventTarget) {
      return body;
    }
    // Without an event target the submit button field is sent instead.
    body.ID_ucReadyMixedPrice$btnAreaMaster = 'Button';
    return body;
  }
  // Request timeout in milliseconds; also quoted in the timeout message below.
  const TIME_OUT = 60000;

  // Shared axios instance used for every request to the price site.
  const axiosInstance = axios.create({
    baseURL: 'http://www.cqsgczjxx.org/Jgxx/',
    timeout: TIME_OUT,
    /* proxy: {
      host: "127.0.0.1", port: "8888" // for capturing traffic with Fiddler; Fiddler must be running or requests fail with a connect error
    }, */
    headers: {
      'Cache-Control': 'max-age=0',
      'Content-Type': 'application/x-www-form-urlencoded',
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36',
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
      'Accept-Encoding': 'gzip, deflate',
      'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7,zh-CN;q=0.6',
    },
    responseType: 'document',
  });

  // Response interceptor: successful responses pass through untouched; errors whose message
  // mentions a timeout are replaced by a readable message string, all others are re-rejected as is.
  axiosInstance.interceptors.response.use(
    (response) => response,
    (error) => {
      const isTimeout = error.message.includes('timeout');
      return Promise.reject(isTimeout ? `目标网络超时,请稍后再试。(${TIME_OUT}ms)` : error);
    },
  );
  // Requests must carry the session cookie, otherwise some of them come back as HTTP 500
  // (the original author suspected an anti-crawling measure on the site).
  let curCookie = '';

  /**
   * POST to a page and parse the returned HTML into a cheerio document.
   * Remembers the first cookie of the response for the following requests.
   * @param {String} url - path relative to the axios base URL
   * @param {Object} [body] - form fields; sent url-encoded when given, otherwise the body is null
   * @return {Function} cheerio root produced by cheerio.load
   */
  async function loadPage(url, body) {
    const config = curCookie ? { headers: { Cookie: curCookie } } : {};
    const payload = body ? querystring.stringify(body) : null;
    const rst = await axiosInstance.post(url, payload, config);
    // Refresh the stored cookie: only the name=value part of the first Set-Cookie header is kept.
    const cookies = rst.headers['set-cookie'];
    if (Array.isArray(cookies)) {
      curCookie = cookies[0].split(';')[0];
    }
    return cheerio.load(rst.data);
  }
  // Month number -> zero-padded month label used when building the period name.
  const monthMap = {
    '1': '01月',
    '2': '02月',
    '3': '03月',
    '4': '04月',
    '5': '05月',
    '6': '06月',
    '7': '07月',
    '8': '08月',
    '9': '09月',
    '10': '10月',
    '11': '11月',
    '12': '12月',
  };

  /**
   * Resolve every period (month) in the inclusive range [from, to] to the uid found in the
   * period menu of the index page.
   * @param {String} from - first period, eg: 2020-01
   * @param {String} to - last period, eg: 2020-05
   * @param {Object} $index - cheerio root of the initial page
   * @return {Array<object>|null} eg: [{ period: '2020年-05月', uid: 'XCCXXXXX-XX' }];
   *   null when the range is malformed or reversed, or when any period has no uid on the page
   */
  function getPeriodData(from, to, $index) {
    const reg = /(\d+)-(\d+)/;
    const fromMatch = String(from).match(reg);
    const toMatch = String(to).match(reg);
    if (!fromMatch || !toMatch) {
      return null;
    }
    const fromYear = +fromMatch[1];
    const fromMonth = +fromMatch[2];
    const toYear = +toMatch[1];
    const toMonth = +toMatch[2];
    // Compare numerically: a string comparison mis-orders unpadded months ('2020-5' > '2020-10').
    if (fromYear > toYear || (fromYear === toYear && fromMonth > toMonth)) {
      return null;
    }
    const $period = $index('#PriceLMenu');
    const list = [];
    let curYear = fromYear;
    let curMonth = fromMonth;
    // The month bound only applies in the final year; testing it in every year would stop
    // ranges that cross a year boundary (eg. 2019-11 .. 2020-02) before they start.
    while (curYear < toYear || (curYear === toYear && curMonth <= toMonth)) {
      const uid = getPeriodUID(curYear, curMonth);
      // A period that cannot be resolved invalidates the whole request.
      if (!uid) {
        return null;
      }
      list.push({
        period: `${curYear}年-${monthMap[curMonth]}`,
        uid,
      });
      if (curMonth === 12) {
        curYear++;
        curMonth = 1;
      } else {
        curMonth++;
      }
    }
    return list;

    // Look up the uid of one period in the menu; null when the year, the month or the uid is missing.
    function getPeriodUID(year, month) {
      const $year = $period.find('.MenuOneTitle').filter(function () {
        return $index(this).text() === `${year}年`;
      });
      if (!$year.length) {
        return null;
      }
      // The month links are looked up in the element that follows the year title's parent.
      const $month = $year.parent().next().find('a').filter(function () {
        return $index(this).text() === `${month}月`;
      });
      if (!$month.length) {
        return null;
      }
      // The uid is the first quoted argument of the Onlink(...) call in the onclick attribute.
      const onclickText = $month.attr('onclick');
      if (!onclickText) {
        return null;
      }
      const matched = String(onclickText).match(/Onlink\('([^']+)'/);
      return matched && matched[1] ? matched[1] : null;
    }
  }
  // Table layouts the crawler knows how to read.
  const TableType = {
    // main materials: building and installation works materials
    BUILDING: 1,
    // main materials: garden / landscaping
    GARDEN: 2,
    // main materials: energy-saving building materials
    ENERGY: 3,
    // area related: per-district materials
    AREA: 4,
    // area related: ready-mixed mortar
    MIXED: 5,
  };
  /**
   * Read the price table of a page with the reader that matches the table type.
   * @param {Object} $page - cheerio root of the page
   * @param {Number} type - one of the TableType values
   * @return {Array<object>} table rows; empty for an unknown type
   */
  function crawlTableData($page, type) {
    if (type === TableType.BUILDING || type === TableType.ENERGY) {
      return crawlNormalTable($page);
    }
    if (type === TableType.GARDEN) {
      return crawlGardenTable($page);
    }
    if (type === TableType.AREA) {
      return crawlAreaTable($page, '#ID_ucAreaPrice_gridView');
    }
    if (type === TableType.MIXED) {
      return crawlAreaTable($page, '#ID_ucReadyMixedPrice_gridView');
    }
    return [];
  }
  /**
   * Read a table whose columns are:
   * No. | material name | specification | unit | price incl. tax | price excl. tax | remark
   * @param {Object} $page - cheerio root of the page
   * @return {Array<object>} one { name, specs, unit, taxPrice, noTaxPrice, remark } per complete row
   */
  function crawlNormalTable($page) {
    const fields = ['name', 'specs', 'unit', 'taxPrice', 'noTaxPrice', 'remark'];
    const lastField = fields.length - 1;
    const rows = [];
    let row;
    $page('#ID_ucPrice_gridView')
      .find('tr td span')
      // every 7th span is dropped (per the original note: header and serial-number column)
      .filter((position) => position % 7 !== 0)
      .each(function (position) {
        const fieldIndex = position % fields.length;
        if (fieldIndex === 0) {
          row = {};
        }
        row[fields[fieldIndex]] = $page(this).text();
        if (fieldIndex === lastField) {
          rows.push(row);
        }
      });
    debugConsole(rows);
    return rows;
  }
  /**
   * Read a table whose columns are:
   * No. | genus | name | height (CM) | trunk diameter (CM) | crown diameter (CM) | branch height (CM) | unit | price incl. tax | price excl. tax | remark
   * @param {Object} $page - cheerio root of the page
   * @return {Array<object>} one object per complete row, keyed as listed in `fields`
   */
  function crawlGardenTable($page) {
    const fields = [
      'genera',
      'name',
      'height',
      'branchDiameter',
      'crownDiameter',
      'branchHeight',
      'unit',
      'taxPrice',
      'noTaxPrice',
      'remark',
    ];
    const lastField = fields.length - 1;
    const rows = [];
    let row;
    $page('#ID_ucPrice_gridView')
      .find('tr td span')
      // every 11th span is dropped (per the original note: header and serial-number column)
      .filter((position) => position % 11 !== 0)
      .each(function (position) {
        const fieldIndex = position % fields.length;
        if (fieldIndex === 0) {
          row = {};
        }
        row[fields[fieldIndex]] = $page(this).text();
        if (fieldIndex === lastField) {
          rows.push(row);
        }
      });
    debugConsole(rows);
    return rows;
  }
  /**
   * Read a table whose columns are:
   * No. | district | material name | specification | unit | price incl. tax | price excl. tax
   * @param {Object} $page - cheerio root of the page
   * @param {String} viewSelector - selector (ID) of the table to read
   * @return {Array<object>} one { area, name, specs, unit, taxPrice, noTaxPrice } per complete row
   */
  function crawlAreaTable($page, viewSelector) {
    const fields = ['area', 'name', 'specs', 'unit', 'taxPrice', 'noTaxPrice'];
    const lastField = fields.length - 1;
    const rows = [];
    let row;
    $page(viewSelector)
      .find('tr td span')
      // every 7th span is dropped (per the original note: header and serial-number column)
      .filter((position) => position % 7 !== 0)
      .each(function (position) {
        const fieldIndex = position % fields.length;
        if (fieldIndex === 0) {
          row = {};
        }
        row[fields[fieldIndex]] = $page(this).text();
        if (fieldIndex === lastField) {
          rows.push(row);
        }
      });
    debugConsole(rows);
    return rows;
  }
  // Values sent as __EVENTTARGET to request the next page of each pager.
  // NOTE(review): MIXED_NEXT is separated with '_' while the other two use '$' — confirm
  // against the live page that this is the value the server expects.
  const EventTarget = {
    // pager of the main material page
    GENERAL_NEXT: 'ID_ucPrice$UcPager1$btnNext',
    // pager of the per-district page
    AREA_NEXT: 'ID_ucAreaPrice$UcPager1$btnNext',
    // pager of the ready-mixed mortar page
    MIXED_NEXT: 'ID_ucReadyMixedPrice_UcPager1_btnNext',
  };
  /**
   * Crawl a paged table: load the first page, read the page count from its pager label,
   * then fetch the remaining pages in batches and append their rows in page order.
   * @param {Object} $index - cheerio root of the page whose hidden fields seed the first request
   * @param {Object} props - submit options passed to the form builder
   * @param {String} pageType - one of the PageType values
   * @param {Number} tableType - one of the TableType values
   * @return {Array<object>} rows of all pages
   */
  async function crawlPagesData($index, props, pageType, tableType) {
    // Pick the form builder, pager label selector and "next page" event target for this page type.
    let buildBody;
    let pageStateSelector;
    let nextTarget;
    if (pageType === PageType.GENERAL) {
      buildBody = getGeneralDataBody;
      pageStateSelector = '#ID_ucPrice_UcPager1_lbPage';
      nextTarget = EventTarget.GENERAL_NEXT;
    } else if (pageType === PageType.AREA) {
      buildBody = getAreaDataBody;
      pageStateSelector = '#ID_ucAreaPrice_UcPager1_lbPage';
      nextTarget = EventTarget.AREA_NEXT;
    } else {
      buildBody = getMixedDataBody;
      pageStateSelector = '#ID_ucReadyMixedPrice_UcPager1_lbPage';
      nextTarget = EventTarget.MIXED_NEXT;
    }

    const $firstPage = await loadPage(pageType, buildBody($index, props));
    const rst = [...crawlTableData($firstPage, tableType)];
    if (rst.length === 0) {
      // Nothing on the first page, so there is nothing to page through.
      return rst;
    }

    // The pager label reads like "1/10"; the part after the slash is the page count.
    const totalPage = +$firstPage(pageStateSelector).text().split('/')[1];
    const asyncCount = 6; // maximum number of pages requested at the same time
    const pages = [];
    for (let page = 1; page < totalPage; page++) {
      pages.push(page);
    }
    for (let start = 0; start < pages.length; start += asyncCount) {
      const batch = pages.slice(start, start + asyncCount);
      const batchData = await Promise.all(batch.map((page) => crawlPageData(page)));
      batchData.forEach((data) => rst.push(...data));
    }
    return rst;

    // Post a "next page" event with `page` as the current page and read the table that comes back.
    async function crawlPageData(page) {
      const pageProps = { ...props, page, eventTarget: nextTarget };
      const $page = await loadPage(pageType, buildBody($firstPage, pageProps));
      return crawlTableData($page, tableType);
    }
  }
  /**
   * Crawl one engineering class of the main material page (building, garden or energy-saving).
   * For the building table every material class is crawled separately, one after another.
   * @param {String} period - period uid
   * @param {String} classID - engineering class id
   * @param {Object} $index - cheerio root of the initial page
   * @param {Number} type - one of the TableType values
   * @return {Array<object>} building: [{ materialClass: '黑色及有色金属', items: [...] }]; otherwise the plain rows
   * @throws {String} when the building table has no material class links
   */
  async function crawlGeneralSubData(period, classID, $index, type) {
    const $engineeringClassPage = await loadPage(
      PageType.GENERAL,
      getGeneralDataBody($index, { period, classID }),
    );

    if (type !== TableType.BUILDING) {
      const items = await crawlPagesData($engineeringClassPage, { period, classID, materialClass: '' }, PageType.GENERAL, type);
      return [...items];
    }

    // Material class names are the link texts inside the category label of the initial page.
    const classList = [];
    $index('#ID_ucPrice_CategoryLabel').find('a').each(function () {
      classList.push($engineeringClassPage(this).text());
    });
    if (!classList.length) {
      throw '无法爬取到材料分类。';
    }

    const reg = /[一二三四五六七八九十]+、/; // leading ordinal of a class name, removed in the result
    const rst = [];
    for (const materialClass of classList) {
      const items = await crawlPagesData($engineeringClassPage, { period, classID, materialClass }, PageType.GENERAL, type);
      rst.push({ materialClass: materialClass.replace(reg, ''), items });
    }
    return rst;
  }
  /**
   * Crawl the main material prices (this part is stored as the general library).
   * Only the engineering classes found in the tab bar of the page are crawled, sequentially.
   * @param {String} period - period uid
   * @param {Object} $index - cheerio root of the initial page
   * @return {Object} { building?, garden?, energy? }
   * @throws {String} when a tab link carries no class id
   */
  async function crawlGeneralData(period, $index) {
    const classIDs = crawlClass($index('#ID_ucPrice_tabNewBar'));
    // Garden rows take their class from the "genus" column; energy rows all belong to a
    // single class. Both are resolved later, when the data is transformed.
    const plan = [
      ['building', TableType.BUILDING],
      ['garden', TableType.GARDEN],
      ['energy', TableType.ENERGY],
    ];
    const rst = {};
    for (const [key, tableType] of plan) {
      const classID = classIDs[key];
      if (classID) {
        rst[key] = await crawlGeneralSubData(period, classID, $index, tableType);
      }
    }
    return rst;

    // Read the class id of each known engineering class from the onclick of its tab link.
    function crawlClass($class) {
      const keyByText = new Map([
        ['建安工程材料', 'building'],
        ['园林绿化工程材料', 'garden'],
        ['绿色、节能建筑工程材料', 'energy'],
      ]);
      const found = { building: undefined, garden: undefined, energy: undefined };
      const reg = /OnClassson\('([^']+)'/;
      $class.find('a').each(function () {
        const $link = $index(this);
        const text = $link.text();
        const matched = $link.attr('onclick').toString().match(reg);
        if (!matched || !matched[1]) {
          throw '无法爬取到工程分类。';
        }
        if (keyByText.has(text)) {
          found[keyByText.get(text)] = matched[1];
        }
      });
      return found;
    }
  }
  /**
   * Crawl the per-district local material prices.
   * @param {String} period - period uid
   * @return {Array<object>} rows of all pages
   */
  async function crawlAreaData(period) {
    const pageType = PageType.AREA;
    // The initial page supplies the hidden form fields for the first postback.
    const $index = await loadPage(pageType);
    return await crawlPagesData($index, { period }, pageType, TableType.AREA);
  }
  /**
   * Crawl the ready-mixed mortar prices.
   * @param {String} period - period uid
   * @return {Array<object>} rows of all pages
   */
  async function crawlMixedData(period) {
    const pageType = PageType.MIXED;
    // The initial page supplies the hidden form fields for the first postback.
    const $index = await loadPage(pageType);
    return await crawlPagesData($index, { period }, pageType, TableType.MIXED);
  }
  /**
   * Convert crawled rows into price items; one source row may become several items.
   *
   * Garden rows: height / trunk diameter / crown diameter / branch height are joined into
   * `specs`, and a price range such as "430-780" is split into a "-最低价" (lowest) and a
   * "-最高价" (highest) item.
   * Other rows: when a price holds several values separated by "/", the specification is
   * split the same way, eg. specs "φ6(6.5)/φ8 HPB300" with prices "4030.00/3880.00" becomes
   * "φ6(6.5) HPB300" and "φ8 HPB300".
   *
   * The source rows are never modified.
   *
   * @param {String} libID - library ID
   * @param {String} classID - ID of the owning class
   * @param {String} period - period, eg: 2020年01月
   * @param {String} areaID - area ID
   * @param {String} compilationID - compilation ID
   * @param {Array<object>} items - crawled source rows
   * @param {Number} tableType - one of the TableType values
   * @return {Array<object>} converted price items
   */
  function transformPriceItems(libID, classID, period, areaID, compilationID, items, tableType) {
    // A lone '-' is treated as "no price" (presumably the site's placeholder — TODO confirm).
    const normalizePrice = (price) => (price === '-' ? '' : price);
    const toPriceItem = (item) => transfromPriceItem(libID, classID, period, areaID, compilationID, item);
    const rst = [];

    if (tableType === TableType.GARDEN) {
      const unit = 'CM';
      const rangeReg = /-/;
      items.forEach((item) => {
        // Build the specification from the dimension columns that have a value.
        const specsList = [];
        if (item.height) {
          specsList.push(`高度${item.height}${unit}`);
        }
        if (item.branchDiameter) {
          specsList.push(`干径${item.branchDiameter}${unit}`);
        }
        if (item.crownDiameter) {
          specsList.push(`冠径${item.crownDiameter}${unit}`);
        }
        if (item.branchHeight) {
          specsList.push(`分枝高${item.branchHeight}${unit}`);
        }
        const specs = specsList.join(' ');
        const taxPrice = normalizePrice(item.taxPrice);
        const noTaxPrice = normalizePrice(item.noTaxPrice);
        // Every garden item carries the assembled specs, not only the split ones.
        const base = { ...item, specs, taxPrice, noTaxPrice };
        const isRange = rangeReg.test(taxPrice) || rangeReg.test(noTaxPrice);
        if (!isRange) {
          rst.push(toPriceItem(base));
          return;
        }
        const taxPriceList = (taxPrice || '').split('-');
        const noTaxPriceList = (noTaxPrice || '').split('-');
        rst.push(toPriceItem({
          ...base,
          name: `${item.name}-最低价`,
          taxPrice: taxPriceList[0],
          noTaxPrice: noTaxPriceList[0],
        }));
        rst.push(toPriceItem({
          ...base,
          name: `${item.name}-最高价`,
          taxPrice: taxPriceList[1] || '',
          noTaxPrice: noTaxPriceList[1] || '',
        }));
      });
      return rst;
    }

    const splitReg = /\//;
    // Trailing part of the specification shared by all variants: "Q390/Q420 δ=20-30" -> "δ=20-30"
    const commonReg = /\s+([^/]*)$/;
    items.forEach((item) => {
      const taxPrice = normalizePrice(item.taxPrice);
      const noTaxPrice = normalizePrice(item.noTaxPrice);
      const base = { ...item, taxPrice, noTaxPrice };
      // A price holding several values is what marks a row that has to be split.
      const isDuplicate = splitReg.test(taxPrice) || splitReg.test(noTaxPrice);
      if (!isDuplicate) {
        rst.push(toPriceItem(base));
        return;
      }
      const rawSpecs = item.specs || '';
      const commonMatched = rawSpecs.match(commonReg);
      const commonSpecs = commonMatched && commonMatched[1] ? ` ${commonMatched[1]}` : '';
      const specsList = rawSpecs.replace(commonReg, '').split('/');
      const taxPriceList = (taxPrice || '').split('/');
      const noTaxPriceList = (noTaxPrice || '').split('/');
      specsList.forEach((specs, index) => {
        rst.push(toPriceItem({
          ...base,
          specs: `${specs}${commonSpecs}`,
          // Fall back to the first price when there are fewer prices than specifications.
          taxPrice: taxPriceList[index] || taxPriceList[0],
          noTaxPrice: noTaxPriceList[index] || noTaxPriceList[0],
        }));
      });
    });
    return rst;
  }
  /**
   * Convert one source row into a price item record with a freshly generated ID.
   * The source row is left untouched.
   * @param {String} libID - library ID
   * @param {String} classID - ID of the owning class
   * @param {String} period - period
   * @param {String} areaID - area ID
   * @param {String} compilationID - compilation ID
   * @param {Object} item - source row: name, specs, unit, taxPrice, noTaxPrice, remark
   * @return {Object} price item record
   */
  function transfromPriceItem(libID, classID, period, areaID, compilationID, item) {
    // Source specifications contain runs of meaningless whitespace; collapse each run to one space.
    const specs = item.specs ? item.specs.replace(/\s{2,}/g, ' ') : '';
    return {
      ID: uuidV1(),
      libID,
      classID,
      code: '',
      name: item.name,
      specs,
      unit: item.unit,
      taxPrice: item.taxPrice,
      noTaxPrice: item.noTaxPrice,
      remark: item.remark || '',
      // The fields below are redundant copies (per the original note: kept so the front-end
      // price feature can use an item without extra lookups).
      period,
      areaID,
      compilationID,
    };
  }
  /**
   * Convert the main material data into the records of the general ("通用") area:
   * one library record, the class tree and the price items.
   *
   * Class tree: building classes are roots; garden rows are grouped (consecutive rows with
   * the same genus) under a "苗木" root; energy rows all go into one root class.
   * Missing building data is skipped; missing garden or energy data still produces their
   * (empty) root classes.
   *
   * @param {String} period - date, eg: 2020年01月
   * @param {String} compilationID - compilation ID
   * @param {Object} generalData - main materials { building, garden, energy }
   * @return {Object} { libData, classData, priceData, compilationAreas }
   */
  async function transfromGeneralData(period, compilationID, generalData) {
    const area = '通用';
    // Areas are matched by name: an area the compilation already has is reused, not re-created.
    const matchedArea = await priceInfoAreaModel.findOne({ compilationID, name: area }).lean();
    // Patch areas stored without the serialNo field.
    if (matchedArea && !matchedArea.serialNo) {
      await priceInfoAreaModel.update({ ID: matchedArea.ID }, { $set: { serialNo: 1 } });
    }
    const areaID = (matchedArea && matchedArea.ID) || uuidV1();
    const compilationAreas = [];
    const libData = {
      ID: uuidV1(),
      name: `信息价(${period})`,
      period,
      areas: [],
      compilationID,
      createDate: Date.now(),
    };
    const classData = [];
    const priceData = [];
    const { building, garden, energy } = generalData;

    handleClassAndItems(building, TableType.BUILDING);

    // Garden classes: "苗木" root with one child per genus. The root carries its own item
    // list so that a row whose genus equals the root name has somewhere to go.
    const gardenRoot = { materialClass: '苗木', treeData: { ID: uuidV1(), ParentID: '-1' }, items: [] };
    const gardenData = [gardenRoot];
    // garden is absent when the page has no garden tab.
    (garden || []).forEach((item) => {
      const pre = gardenData[gardenData.length - 1];
      if (item.genera !== pre.materialClass) {
        gardenData.push({ materialClass: item.genera, treeData: { ParentID: gardenRoot.treeData.ID }, items: [item] });
      } else {
        pre.items.push(item);
      }
    });
    handleClassAndItems(gardenData, TableType.GARDEN);

    // Energy rows all belong to the single class below.
    const energyData = [{ materialClass: '绿色、节能建筑工程材料', items: energy }];
    handleClassAndItems(energyData, TableType.ENERGY);

    // The area is only registered when there is data and the compilation does not have it yet.
    if ((classData.length || priceData.length) && !matchedArea) {
      compilationAreas.push({ compilationID, ID: areaID, name: area });
    }
    return { libData, classData, priceData, compilationAreas };

    // Append the classes of one source list to classData (linking siblings) and convert their rows.
    function handleClassAndItems(sourceData, tableType) {
      if (!sourceData) {
        return;
      }
      sourceData.forEach(({ materialClass, treeData, items }) => {
        const classItem = {
          ID: (treeData && treeData.ID) || uuidV1(),
          ParentID: (treeData && treeData.ParentID) || '-1',
          NextSiblingID: (treeData && treeData.NextSiblingID) || '-1',
          name: materialClass,
          libID: libData.ID,
          areaID,
        };
        // The nearest earlier class with the same parent is the previous sibling.
        for (let i = classData.length - 1; i >= 0; i--) {
          if (classData[i].ParentID === classItem.ParentID) {
            classData[i].NextSiblingID = classItem.ID;
            break;
          }
        }
        classData.push(classItem);
        if (items && items.length) {
          const newItems = transformPriceItems(libData.ID, classItem.ID, period, areaID, compilationID, items, tableType);
          newItems.forEach((item) => priceData.push(item));
        }
      });
    }
  }
  /**
   * Convert the area related data (local materials and ready-mixed mortar).
   * Rows are grouped by area in the order the areas first appear; every area gets up to two
   * root classes ("地方材料信息价" and "预拌商品砂浆"), both attached to the given library.
   * @param {String} period - date, eg: 2020年01月
   * @param {String} compilationID - compilation ID
   * @param {Object} libData - library record of the current period
   * @param {Array<object>} areaData - per-district local material rows
   * @param {Array<object>} mixedData - ready-mixed mortar rows
   * @return {Object} { classData, priceData, compilationAreas }
   */
  async function transformAreaData(period, compilationID, libData, areaData, mixedData) {
    const areaClass = '地方材料信息价';
    const mixedClass = '预拌商品砂浆';

    // grouped[i] = { area, subData: [local material class, mortar class] }; slotByArea maps an
    // area name to its index so that the crawl order of the areas is preserved.
    const grouped = [];
    const slotByArea = {};
    const slotOf = (area) => {
      if (!isDef(slotByArea[area])) {
        slotByArea[area] = Object.keys(slotByArea).length;
      }
      return slotByArea[area];
    };
    const collect = (sourceData) => {
      // Local material rows fill sub slot 0, mortar rows fill sub slot 1.
      const isAreaSource = sourceData === areaData;
      const subSlot = isAreaSource ? 0 : 1;
      const className = isAreaSource ? areaClass : mixedClass;
      sourceData.forEach((item) => {
        const idx = slotOf(item.area);
        if (!grouped[idx]) {
          grouped[idx] = { area: item.area, subData: [] };
        }
        const { subData } = grouped[idx];
        if (!subData[subSlot]) {
          subData[subSlot] = { className, items: [] };
        }
        subData[subSlot].items.push(item);
      });
    };
    collect(areaData);
    collect(mixedData);

    const compilationAreas = [];
    const classData = [];
    const priceData = [];
    for (const { area, subData } of grouped) {
      const matchedArea = await priceInfoAreaModel.findOne({ compilationID, name: area }).lean();
      // serialNo is the 1-based position in `areas` (0 for a name that is not listed);
      // areas stored without it are patched.
      const serialNo = areas.indexOf(area) + 1;
      if (matchedArea && !matchedArea.serialNo) {
        await priceInfoAreaModel.update({ ID: matchedArea.ID }, { $set: { serialNo } });
      }
      const areaID = (matchedArea && matchedArea.ID) || uuidV1();
      if (!matchedArea) {
        compilationAreas.push({ compilationID, serialNo, ID: areaID, name: area });
      }

      let preClass;
      for (const { className, items } of subData.filter(Boolean)) {
        const classItem = {
          ID: uuidV1(),
          ParentID: '-1',
          NextSiblingID: '-1',
          name: className,
          libID: libData.ID,
          areaID,
        };
        classData.push(classItem);
        // Chain the classes of one area as siblings.
        if (preClass) {
          preClass.NextSiblingID = classItem.ID;
        }
        preClass = classItem;
        const newItems = transformPriceItems(libData.ID, classItem.ID, period, areaID, compilationID, items, TableType.AREA);
        newItems.forEach((item) => priceData.push(item));
      }
    }
    return { classData, priceData, compilationAreas };
  }
  /**
   * Store one period of crawled data.
   * Builds one general library plus the per-area data, removes any data already
   * stored for the same period, then inserts the new records.
   * @param {String} period - period label, eg: '2020年05月'
   * @param {Object} generalData - main materials { building, garden, energy }
   * @param {Array<object>} areaData - local materials of each area
   * @param {Array<object>} mixedData - ready-mixed mortar of each area
   * @throws {String} when no compilation is configured with the expected overWriteUrl
   */
  async function save(period, generalData, areaData, mixedData) {
    const overWriteUrl = '/web/over_write/js/chongqing_2018.js';
    const compilation = await compilationModel.findOne({ overWriteUrl }, '_id').lean();
    if (!compilation) {
      // Kept as a string throw: crawlData builds its message with String(err).
      throw '没有找到正确配置overWriteUrl的费用定额。';
    }
    const compilationID = compilation._id;

    // Transform the crawled data into records.
    const generalSaveData = await transfromGeneralData(period, compilationID, generalData);
    const libData = generalSaveData.libData;
    const areaSaveData = await transformAreaData(period, compilationID, libData, areaData, mixedData);
    const classData = [...generalSaveData.classData, ...areaSaveData.classData];
    const priceData = [...generalSaveData.priceData, ...areaSaveData.priceData];
    const compilationAreas = [...generalSaveData.compilationAreas, ...areaSaveData.compilationAreas];

    // Remove data already stored for the same period.
    // NOTE(review): the lookup and deletes are keyed by period only, not by
    // compilationID - confirm no other compilation stores data under the same period.
    const originalLibs = await priceInfoLibModel.find({ period }, '-_id ID').lean();
    const originalLibIDList = originalLibs.map(lib => lib.ID);
    if (originalLibIDList.length) {
      await priceInfoItemModel.deleteMany({ period });
      await priceInfoClassModel.deleteMany({ libID: { $in: originalLibIDList } });
      await priceInfoLibModel.deleteMany({ period });
    }

    // Insert the new data; empty collections are skipped.
    if (priceData.length) {
      await priceInfoItemModel.insertMany(priceData);
    }
    if (classData.length) {
      await priceInfoClassModel.insertMany(classData);
    }
    if (libData) {
      await priceInfoLibModel.insertMany([libData]);
    }
    // An array is always truthy, so check its length like the guards above.
    if (compilationAreas.length) {
      await priceInfoAreaModel.insertMany(compilationAreas);
    }
  }
  /**
   * Crawl and store the data of every period in the requested range.
   * Periods are processed one after another; each one is crawled and then saved.
   * @param {String} from - first period, eg: 2020-01
   * @param {String} to - last period, eg: 2020-05
   * @param {String} compilationID - ID of the compilation whose areas are patched
   * @return {Promise<void>}
   * @throws {String} the failure text, followed by the range of periods already
   *   saved when at least one period succeeded
   */
  async function crawlData(from, to, compilationID) {
    // Last period that was crawled and saved successfully.
    let curPeriod;
    try {
      const $index = await loadPage(PageType.GENERAL);
      const periodData = getPeriodData(from, to, $index);
      if (!periodData) {
        throw '无效的期数区间。';
      }
      // Area patch: give a serialNo to stored areas that do not have one.
      // Named differently from the per-period `areaData` below to avoid shadowing.
      const areasWithoutSerialNo = await priceInfoAreaModel.find({ compilationID, serialNo: null }).lean();
      const bulks = areasWithoutSerialNo.map(areaItem => ({
        updateOne: {
          filter: { ID: areaItem.ID },
          // Explicit $set, matching the equivalent patch in transformAreaData.
          update: { $set: { serialNo: areas.indexOf(areaItem.name) + 1 } },
        },
      }));
      if (bulks.length) {
        await priceInfoAreaModel.bulkWrite(bulks);
      }
      // Crawl period by period.
      debugConsole('allTime', 'time');
      for (const periodItem of periodData) {
        debugConsole('peroidTime', 'time');
        // Main material prices; the index page loaded above is passed along.
        const generalData = await crawlGeneralData(periodItem.uid, $index);
        // Local material prices of each area.
        const areaData = await crawlAreaData(periodItem.uid);
        // Ready-mixed mortar prices.
        const mixedData = await crawlMixedData(periodItem.uid);
        // Transform and store.
        await save(periodItem.period, generalData, areaData, mixedData);
        curPeriod = periodItem.period;
        debugConsole('peroidTime', 'timeEnd');
      }
      debugConsole('allTime', 'timeEnd');
    } catch (err) {
      console.log(err);
      // On failure, report which periods were already crawled successfully.
      let errTip = '';
      if (curPeriod) {
        errTip += `\n成功爬取期数为:${from}到${curPeriod}`;
      }
      const errStr = String(err) + errTip;
      console.log(`err`);
      console.log(errStr);
      throw errStr;
    }
  }