// chongqing_2018_price_crawler.js (19 KB)
  1. /**
  2. * 重庆材料信息价爬虫
  3. * 由于headless chrome “puppeteer”占用资源比较大,且材料信息价的网站渲染的是静态内容,因此不需要使用puppeteer。
  4. * 数据获取使用cheerio(解析html,可用类jquery语法操作生成的数据)
  5. */
  6. module.exports = {
  7. crawlData,
  8. };
  9. const cheerio = require('cheerio');
  10. const axios = require('axios');
  11. const querystring = require('querystring');
  // Page types: the path of each price listing page.
  // Frozen so the shared lookup table cannot be modified by accident.
  const PageType = Object.freeze({
    GENERAL: '/Index.aspx', // main material information prices
    AREA: '/AreaIndex.aspx', // district/county local material site prices
    MIXED: '/ReadyMixedIndex.aspx', // ready-mixed mortar information prices
  });
  /**
   * Build the form data posted to the main material information price page.
   *
   * @param {Object} $ - cheerio-loaded page; the hidden `__VIEWSTATE`,
   *   `__VIEWSTATEGENERATOR`, `txtfatherclass` and `btnLink` values are read from it
   * @param {Object} props - submit properties
   * @param {String} props.period - period uid
   * @param {String} [props.classID] - engineering class id
   * @param {String} [props.materialClass] - material class name
   * @param {Number|String} [props.page] - current page number (defaults to 1)
   * @param {String} [props.eventTarget] - postback event target; when absent the
   *   link button value is submitted instead
   * @return {Object} form fields keyed by the page's control names
   */
  function getGeneralDataBody($, props) {
    const options = props || {};
    // Normalise once so a missing value is never interpolated as "undefined".
    const classID = options.classID || '';
    const materialClass = options.materialClass || '';
    const body = {
      __EVENTTARGET: options.eventTarget || '',
      __EVENTARGUMENT: '',
      __VIEWSTATE: $('#__VIEWSTATE').val(),
      __VIEWSTATEGENERATOR: $('#__VIEWSTATEGENERATOR').val(),
      ID_ucPrice$linkvv: options.period,
      ID_ucPrice$linkcategory: materialClass,
      ID_ucPrice$LinkValue: `${classID},${options.period},${materialClass}`,
      ID_ucPrice$txtsonclass: `sonclass${classID}`,
      ID_ucPrice$txtfatherclass: $('#ID_ucPrice_txtfatherclass').val(),
      ID_ucPrice$txtClassId: classID,
      ID_ucPrice$ddlSearchYear: '请选择',
      ID_ucPrice$ddlSearchMonth: '请选择',
      ID_ucPrice$txtSearchCailiao: '',
      ID_ucPrice$UcPager1$listPage: String(options.page || 1),
    };
    // Without an explicit event target the request is submitted via the link button.
    if (!options.eventTarget) {
      body.ID_ucPrice$btnLink = $('#ID_ucPrice_btnLink').val();
    }
    return body;
  }
  /**
   * Build the form data posted to the district/county local material site
   * price page.
   *
   * @param {Object} $ - cheerio-loaded page; the hidden `__VIEWSTATE` and
   *   `__VIEWSTATEGENERATOR` values are read from it
   * @param {Object} props - submit properties (`period`, optional `page` and
   *   `eventTarget`)
   * @return {Object} form fields, or an empty object when `props` is missing
   *   or has no keys
   */
  function getAreaDataBody($, props) {
    if (!props || Object.keys(props).length === 0) {
      return {};
    }
    const { eventTarget, period, page } = props;
    const body = {
      __EVENTTARGET: eventTarget || '',
      __EVENTARGUMENT: '',
      __VIEWSTATE: $('#__VIEWSTATE').val(),
      __VIEWSTATEGENERATOR: $('#__VIEWSTATEGENERATOR').val(),
      ID_ucAreaPrice$linkvv: period,
      ID_ucAreaPrice$LinkValue: '',
      ID_ucAreaPrice$dropArea: 'code',
      ID_ucAreaPrice$txtSearchCailiao: '',
      ID_ucAreaPrice$UcPager1$listPage: String(page || 1),
    };
    // Without an explicit event target the request is submitted via the button.
    if (!eventTarget) {
      body.ID_ucAreaPrice$btnAreaMaster = 'Button';
    }
    return body;
  }
  /**
   * Build the form data posted to the ready-mixed mortar information price page.
   *
   * @param {Object} $ - cheerio-loaded page; the hidden `__VIEWSTATE` and
   *   `__VIEWSTATEGENERATOR` values are read from it
   * @param {Object} props - submit properties (`period`, optional `page` and
   *   `eventTarget`)
   * @return {Object} form fields, or an empty object when `props` is missing
   *   or has no keys
   */
  function getMixedDataBody($, props) {
    if (!props || Object.keys(props).length === 0) {
      return {};
    }
    const { eventTarget, period, page } = props;
    const body = {
      __EVENTTARGET: eventTarget || '',
      __EVENTARGUMENT: '',
      __VIEWSTATE: $('#__VIEWSTATE').val(),
      __VIEWSTATEGENERATOR: $('#__VIEWSTATEGENERATOR').val(),
      ID_ucReadyMixedPrice$linkvv: period,
      ID_ucReadyMixedPrice$LinkValue: '',
      ID_ucReadyMixedPrice$dropArea: 'code',
      ID_ucReadyMixedPrice$txtSearchCailiao: '',
      ID_ucReadyMixedPrice$UcPager1$listPage: String(page || 1),
    };
    // Without an explicit event target the request is submitted via the button.
    if (!eventTarget) {
      body.ID_ucReadyMixedPrice$btnAreaMaster = 'Button';
    }
    return body;
  }
  // Request timeout in milliseconds.
  const TIME_OUT = 10000;

  // Shared axios instance used for every request to the price site.
  const axiosInstance = axios.create({
    baseURL: 'http://www.cqsgczjxx.org/Jgxx/',
    timeout: TIME_OUT,
    // NOTE(review): this routes all traffic through a local Fiddler capture
    // proxy (debugging aid). Requests fail when nothing listens on that port,
    // so this presumably has to be removed outside of debugging - confirm.
    proxy: {
      host: '127.0.0.1',
      port: 8888, // numeric, as in the axios request-config documentation
    },
    headers: {
      'Cache-Control': 'max-age=0',
      'Content-Type': 'application/x-www-form-urlencoded',
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36',
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
      'Accept-Encoding': 'gzip, deflate',
      'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7,zh-CN;q=0.6',
    },
    responseType: 'document',
  });

  // Response interceptor: pass successful responses through unchanged and turn
  // timeout failures into a readable message; other errors are forwarded as-is.
  axiosInstance.interceptors.response.use(
    (response) => response,
    (error) => {
      const message = (error && error.message) || '';
      if (message.includes('timeout')) {
        // Reject with an Error (not a bare string) and keep the original
        // failure reachable for debugging.
        const timeoutError = new Error(`目标网络超时,请稍后再试。(${TIME_OUT}ms)`);
        timeoutError.cause = error;
        return Promise.reject(timeoutError);
      }
      return Promise.reject(error);
    },
  );
  // Requests must carry the site's cookie, otherwise some of them are answered
  // with a 500 error (presumably an anti-crawling measure of the site).
  let curCookie = '';

  /**
   * Load a page and return it as a cheerio document (jQuery-like API).
   *
   * The page is always requested with POST; when `body` is given it is sent
   * URL-encoded. Every `Set-Cookie` entry of the response is merged into the
   * module-level cookie so that later requests send all of them.
   *
   * @param {String} url - path appended to the axios base URL
   * @param {Object} [body] - form data
   * @return {Promise<Object>} cheerio document of the response HTML
   */
  async function loadPage(url, body) {
    const config = curCookie ? { headers: { Cookie: curCookie } } : {};
    const payload = body ? querystring.stringify(body) : null;
    const rst = await axiosInstance.post(url, payload, config);

    // Update the cookie: keep earlier cookies and overwrite those set again.
    const setCookie = rst.headers['set-cookie'];
    if (Array.isArray(setCookie) && setCookie.length > 0) {
      const jar = new Map();
      const remember = (pair) => {
        const text = pair.trim();
        if (!text) {
          return;
        }
        const eq = text.indexOf('=');
        jar.set(eq === -1 ? text : text.slice(0, eq), text);
      };
      curCookie.split(';').forEach(remember);
      // Only the leading "name=value" part of each Set-Cookie entry is sent back.
      setCookie.forEach((entry) => remember(entry.split(';')[0]));
      curCookie = [...jar.values()].join('; ');
    }
    return cheerio.load(rst.data);
  }
  // Month number -> zero-padded two-digit month text.
  // Frozen so the shared lookup table cannot be modified by accident.
  const monthMap = Object.freeze({
    '1': '01',
    '2': '02',
    '3': '03',
    '4': '04',
    '5': '05',
    '6': '06',
    '7': '07',
    '8': '08',
    '9': '09',
    '10': '10',
    '11': '11',
    '12': '12',
  });
  /**
   * Resolve every period between `from` and `to` (both inclusive) to the uid
   * the site uses for it.
   *
   * @param {String} from - first period, eg: 2020-01
   * @param {String} to - last period, eg: 2020-05
   * @param {Object} $index - cheerio-loaded initial page
   * @return {Array<Object>|null} eg: [{ period: '2020-05', uid: 'XCCXXXXX-XX' }];
   *   null when the range is malformed, reversed, or contains a period that
   *   cannot be found on the page
   */
  function getPeriodData(from, to, $index) {
    const parsePeriod = (text) => {
      const matched = typeof text === 'string' ? text.match(/(\d+)-(\d+)/) : null;
      if (!matched) {
        return null;
      }
      const year = Number(matched[1]);
      const month = Number(matched[2]);
      if (month < 1 || month > 12) {
        return null;
      }
      // A single month index makes comparing and stepping across year
      // boundaries straightforward.
      return year * 12 + month;
    };

    const first = parsePeriod(from);
    const last = parsePeriod(to);
    if (first === null || last === null || first > last) {
      return null;
    }

    const $period = $index('#PriceLMenu');

    // Look up the uid of one period in the period menu.
    const getPeriodUID = (year, month) => {
      const $year = $period.find('.MenuOneTitle').filter(function () {
        return $index(this).text() === `${year}年`;
      });
      if (!$year.length) {
        return null;
      }
      const $month = $year.parent().next().find('a').filter(function () {
        return $index(this).text() === `${month}月`;
      });
      if (!$month.length) {
        return null;
      }
      // The uid is embedded in the onclick handler and has to be extracted.
      const onclickText = $month.attr('onclick');
      if (!onclickText) {
        return null;
      }
      const matched = String(onclickText).match(/Onlink\('([^']+)'/);
      return matched ? matched[1] : null;
    };

    const list = [];
    for (let index = first; index <= last; index++) {
      const year = Math.floor((index - 1) / 12);
      const month = ((index - 1) % 12) + 1;
      const uid = getPeriodUID(year, month);
      // One invalid period invalidates the whole range.
      if (!uid) {
        return null;
      }
      list.push({
        period: `${year}-${String(month).padStart(2, '0')}`,
        uid,
      });
    }
    return list;
  }
  // Table types: which table layout a result page uses.
  // Frozen so the shared lookup table cannot be modified by accident.
  const TableType = Object.freeze({
    BUILDING: 1, // main materials: construction and installation engineering materials
    GARDEN: 2, // main materials: landscaping/greening
    ENERGY: 3, // main materials: energy-saving building engineering materials
    AREA: 4, // area related (district/county materials)
    MIXED: 5, // area related (ready-mixed mortar)
  });
  /**
   * Extract the table rows of a result page with the parser that matches the
   * table type.
   *
   * @param {Object} $page - cheerio-loaded page
   * @param {Number} type - table type (a `TableType` value)
   * @return {Array<Object>} parsed rows; an empty array for an unknown type
   */
  function crawlTableData($page, type) {
    switch (type) {
      case TableType.BUILDING:
      case TableType.ENERGY:
        return crawlNormalTable($page);
      case TableType.GARDEN:
        return crawlGardenTable($page);
      case TableType.AREA:
        return crawlAreaTable($page, '#ID_ucAreaPrice_gridView');
      case TableType.MIXED:
        return crawlAreaTable($page, '#ID_ucReadyMixedPrice_gridView');
      default:
        return [];
    }
  }
  /**
   * Extract the rows of a table whose columns are:
   * No. | material name | specification/model | unit | price incl. tax (yuan) |
   * price excl. tax (yuan) | remark
   *
   * Every `tr td span` of `#ID_ucPrice_gridView` is read in document order and
   * grouped into rows of seven cells; the leading serial-number cell of each
   * row is skipped. A trailing incomplete row is dropped.
   *
   * @param {Object} $page - cheerio-loaded page
   * @return {Array<Object>} rows as
   *   { name, specs, unit, taxPrice, noTaxPrice, remark }
   */
  function crawlNormalTable($page) {
    const fields = ['name', 'specs', 'unit', 'taxPrice', 'noTaxPrice', 'remark'];
    const cellsPerRow = fields.length + 1; // data cells plus the serial number
    const data = [];
    let cur;
    $page('#ID_ucPrice_gridView').find('tr td span').each(function (index) {
      const col = index % cellsPerRow;
      if (col === 0) {
        // Serial-number cell: start a new row and skip the value.
        cur = {};
        return;
      }
      cur[fields[col - 1]] = $page(this).text();
      if (col === fields.length) {
        data.push(cur);
      }
    });
    return data;
  }
  /**
   * Extract the rows of a table whose columns are:
   * No. | family/genus | product name | height (CM) | trunk diameter (CM) |
   * crown diameter (CM) | branch height (CM) | unit | price incl. tax (yuan) |
   * price excl. tax (yuan) | remark
   *
   * Every `tr td span` of `#ID_ucPrice_gridView` is read in document order and
   * grouped into rows of eleven cells; the leading serial-number cell of each
   * row is skipped. A trailing incomplete row is dropped.
   *
   * @param {Object} $page - cheerio-loaded page
   * @return {Array<Object>} rows as { genera, name, height, branchDiameter,
   *   crownDiameter, branchHeight, unit, taxPrice, noTaxPrice, remark }
   */
  function crawlGardenTable($page) {
    const fields = [
      'genera',
      'name',
      'height',
      'branchDiameter',
      'crownDiameter',
      'branchHeight',
      'unit',
      'taxPrice',
      'noTaxPrice',
      'remark',
    ];
    const cellsPerRow = fields.length + 1; // data cells plus the serial number
    const data = [];
    let cur;
    $page('#ID_ucPrice_gridView').find('tr td span').each(function (index) {
      const col = index % cellsPerRow;
      if (col === 0) {
        // Serial-number cell: start a new row and skip the value.
        cur = {};
        return;
      }
      cur[fields[col - 1]] = $page(this).text();
      if (col === fields.length) {
        data.push(cur);
      }
    });
    return data;
  }
  /**
   * Extract the rows of a table whose columns are:
   * No. | district/county | material name | specification and model |
   * unit of measure | price incl. tax (yuan) | price excl. tax (yuan)
   *
   * Every `tr td span` of the selected table is read in document order and
   * grouped into rows of seven cells; the leading serial-number cell of each
   * row is skipped. A trailing incomplete row is dropped.
   *
   * @param {Object} $page - cheerio-loaded page
   * @param {String} viewSelector - selector (ID) of the table
   * @return {Array<Object>} rows as
   *   { area, name, specs, unit, taxPrice, noTaxPrice }
   */
  function crawlAreaTable($page, viewSelector) {
    const fields = ['area', 'name', 'specs', 'unit', 'taxPrice', 'noTaxPrice'];
    const cellsPerRow = fields.length + 1; // data cells plus the serial number
    const data = [];
    let cur;
    $page(viewSelector).find('tr td span').each(function (index) {
      const col = index % cellsPerRow;
      if (col === 0) {
        // Serial-number cell: start a new row and skip the value.
        cur = {};
        return;
      }
      cur[fields[col - 1]] = $page(this).text();
      if (col === fields.length) {
        data.push(cur);
      }
    });
    return data;
  }
  // Postback event targets: the control that is "clicked" to get the next page.
  // NOTE(review): MIXED_NEXT used the underscore-separated form
  // ('ID_ucReadyMixedPrice_UcPager1_btnNext') while the other two use the
  // '$'-separated control name. It is aligned with them here on the assumption
  // that the server expects the same form for all three pagers - confirm
  // against the __doPostBack call in the ready-mixed page.
  const EventTarget = Object.freeze({
    GENERAL_NEXT: 'ID_ucPrice$UcPager1$btnNext',
    AREA_NEXT: 'ID_ucAreaPrice$UcPager1$btnNext',
    MIXED_NEXT: 'ID_ucReadyMixedPrice$UcPager1$btnNext',
  });
  /**
   * Crawl a paginated table page by page and return all rows.
   *
   * The first page is requested with `props` as given. Its pager label
   * (eg: "1/10") yields the page count, and every further page is requested by
   * posting the "next" event target together with the number of the page
   * before it, using the form state of the first page. Follow-up pages are
   * requested in batches that run concurrently.
   *
   * @param {Object} $index - cheerio-loaded index page (source of the form state
   *   for the first request)
   * @param {Object} props - form properties to submit
   * @param {String} pageType - page type (a `PageType` value); any value other
   *   than GENERAL or AREA is handled as MIXED
   * @param {Number} tableType - table type (a `TableType` value)
   * @return {Promise<Array<Object>>} rows of all pages, in page order
   */
  async function crawlPagesData($index, props, pageType, tableType) {
    // Maximum number of pages requested concurrently.
    const BATCH_SIZE = 6;

    // Resolve everything that depends on the page type once.
    let buildBody;
    let pageStateSelector;
    let nextTarget;
    if (pageType === PageType.GENERAL) {
      buildBody = getGeneralDataBody;
      pageStateSelector = '#ID_ucPrice_UcPager1_lbPage';
      nextTarget = EventTarget.GENERAL_NEXT;
    } else if (pageType === PageType.AREA) {
      buildBody = getAreaDataBody;
      pageStateSelector = '#ID_ucAreaPrice_UcPager1_lbPage';
      nextTarget = EventTarget.AREA_NEXT;
    } else {
      buildBody = getMixedDataBody;
      pageStateSelector = '#ID_ucReadyMixedPrice_UcPager1_lbPage';
      nextTarget = EventTarget.MIXED_NEXT;
    }

    // First page.
    const $firstPage = await loadPage(pageType, buildBody($index, props));
    const rst = [...crawlTableData($firstPage, tableType)];

    // Page count from the pager label, eg: 1/10. A missing or unparsable label
    // means there is nothing beyond the first page.
    const pageState = $firstPage(pageStateSelector).text();
    const parsedTotal = Number(pageState.split('/')[1]);
    const totalPage = Number.isFinite(parsedTotal) ? parsedTotal : 1;

    // Request the page that follows `page`.
    const crawlPageData = async (page) => {
      const body = buildBody($firstPage, { ...props, page, eventTarget: nextTarget });
      const $page = await loadPage(pageType, body);
      return crawlTableData($page, tableType);
    };

    // Remaining pages, BATCH_SIZE at a time; Promise.all keeps the page order.
    for (let start = 1; start < totalPage; start += BATCH_SIZE) {
      const end = Math.min(start + BATCH_SIZE, totalPage);
      const task = [];
      for (let page = start; page < end; page++) {
        task.push(crawlPageData(page));
      }
      const allData = await Promise.all(task);
      allData.forEach((data) => rst.push(...data));
    }
    return rst;
  }
  /**
   * Crawl one engineering class of the main material prices: construction and
   * installation materials, landscaping materials, or energy-saving building
   * materials.
   *
   * For `TableType.BUILDING` the material classes listed under
   * `#ID_ucPrice_CategoryLabel` of the initial page are crawled one after the
   * other; for every other type all rows are crawled without a material class.
   * The elapsed time is reported through `console.time`/`console.timeEnd`.
   *
   * @param {String} period - period uid
   * @param {String} classID - engineering class id
   * @param {Object} $index - cheerio-loaded initial page
   * @param {Number} type - table type (a `TableType` value)
   * @return {Promise<Array<Object>>} for BUILDING:
   *   [{ materialClass: '一、黑色及有色金属', items: [...] }]; otherwise the rows
   * @throws {Error} when no material class can be found for BUILDING
   */
  async function crawlGeneralSubData(period, classID, $index, type) {
    console.time('crawlGeneralSubData');
    try {
      const body = getGeneralDataBody($index, { period, classID });
      const $engineeringClassPage = await loadPage(PageType.GENERAL, body);

      if (type !== TableType.BUILDING) {
        const items = await crawlPagesData($engineeringClassPage, { period, classID, materialClass: '' }, PageType.GENERAL, type);
        return [...items];
      }

      // Material class names are the link texts of the category label.
      const classList = [];
      $index('#ID_ucPrice_CategoryLabel').find('a').each(function () {
        classList.push($index(this).text());
      });
      if (!classList.length) {
        throw new Error('无法爬取到材料分类。');
      }

      const rst = [];
      for (const materialClass of classList) {
        const items = await crawlPagesData($engineeringClassPage, { period, classID, materialClass }, PageType.GENERAL, type);
        rst.push({ materialClass, items });
      }
      return rst;
    } finally {
      // End the timer on failures as well, so the label never stays open.
      console.timeEnd('crawlGeneralSubData');
    }
  }
  /**
   * Crawl the main material information prices (this part serves as the
   * general library).
   *
   * The engineering class ids are read from the `onclick` handlers of the
   * links under `#ID_ucPrice_tabNewBar`; every class that is present is then
   * crawled, one after the other.
   *
   * @param {String} period - period uid
   * @param {Object} $index - cheerio-loaded initial page
   * @return {Promise<Object>} { building, garden, energy }, holding only the
   *   classes found on the page
   * @throws {Error} when a class link carries no extractable class id
   */
  async function crawlGeneralData(period, $index) {
    // Link text -> result key.
    const keyByTitle = new Map([
      ['建安工程材料', 'building'],
      ['园林绿化工程材料', 'garden'],
      ['绿色、节能建筑工程材料', 'energy'],
    ]);
    const classIdPattern = /OnClassson\('([^']+)'/;

    // Collect the engineering class ids.
    const classIds = {};
    $index('#ID_ucPrice_tabNewBar').find('a').each(function () {
      const $link = $index(this);
      const matched = String($link.attr('onclick') || '').match(classIdPattern);
      if (!matched) {
        throw new Error('无法爬取到工程分类。');
      }
      const key = keyByTitle.get($link.text().trim());
      if (key) {
        classIds[key] = matched[1];
      }
    });

    // Rows under "garden" are classified by their family/genus column; all rows
    // under "energy" belong to the single class named by that link text.
    const plan = [
      ['building', TableType.BUILDING],
      ['garden', TableType.GARDEN],
      ['energy', TableType.ENERGY],
    ];
    const rst = {};
    for (const [key, type] of plan) {
      if (classIds[key]) {
        rst[key] = await crawlGeneralSubData(period, classIds[key], $index, type);
      }
    }
    return rst;
  }
  /**
   * Crawl the district/county local material site prices.
   *
   * @param {String} period - period uid
   * @return {Promise<Array<Object>>} rows of all pages
   */
  async function crawlAreaData(period) {
    // Load the initial page of the area listing first; it provides the form
    // state for the follow-up requests.
    const $index = await loadPage(PageType.AREA);
    return crawlPagesData($index, { period }, PageType.AREA, TableType.AREA);
  }
  /**
   * Crawl the ready-mixed mortar information prices.
   *
   * @param {String} period - period uid
   * @return {Promise<Array<Object>>} rows of all pages
   */
  async function crawlMixedData(period) {
    // Load the initial page of the ready-mixed listing first; it provides the
    // form state for the follow-up requests.
    const $index = await loadPage(PageType.MIXED);
    return crawlPagesData($index, { period }, PageType.MIXED, TableType.MIXED);
  }
  /**
   * Transform the crawled data of one period and save it.
   *
   * TODO: not implemented yet - the body is empty and nothing is stored.
   *
   * @param {String} period - period, eg: '2020-05'
   * @param {Object} generalData - main materials { building, garden, energy }
   * @param {Array<Object>} areaData - district/county materials
   * @param {Array<Object>} mixedData - district/county ready-mixed mortar
   */
  function transfromAndSave(period, generalData, areaData, mixedData) {
  }
  /**
   * Crawl all price data of every period between `from` and `to`.
   *
   * Periods are crawled one at a time; each finished period is handed to
   * `transfromAndSave`. Failures are not re-thrown: they are reported on the
   * console together with the range of periods that was completed.
   *
   * @param {String} from - first period, eg: 2020-01
   * @param {String} to - last period, eg: 2020-05
   * @return {Promise<Object|undefined>} the crawled data keyed by period
   *   ({ generalData, areaData, mixedData } each), or undefined after a failure
   */
  async function crawlData(from, to) {
    let curPeriod;
    try {
      // The initial page is the main material information price page.
      const $index = await loadPage(PageType.GENERAL);
      const periodData = getPeriodData(from, to, $index);
      if (!periodData) {
        throw new Error('无效的期数区间。');
      }

      const results = {};
      for (const { period, uid } of periodData) {
        // Main material information prices.
        const generalData = await crawlGeneralData(uid, $index);
        // District/county local material site prices.
        const areaData = await crawlAreaData(uid);
        // Ready-mixed mortar information prices.
        const mixedData = await crawlMixedData(uid);

        await transfromAndSave(period, generalData, areaData, mixedData);
        results[period] = { generalData, areaData, mixedData };
        // Only advanced once the whole period is done, so the error report
        // below names the last period that was completed.
        curPeriod = period;
      }
      return results;
    } catch (err) {
      // On failure, report which periods were crawled successfully.
      let errTip = '';
      if (curPeriod) {
        errTip += `\n成功爬取期数为:${from}到${curPeriod}`;
      }
      const reason = err instanceof Error ? err.message : String(err);
      console.error(reason + errTip);
      return undefined;
    }
  }
  573. }