guangdong_2018_price_crawler.js 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503
  1. /*
  2. * @Description: 广东材料信息价格获取(通过造价通接口)
  3. * @Author: vian
  4. * @Date: 2020-09-29 11:22:59
  5. */
  // Public API of this module. `crawlData` is a function declaration further down,
  // so it is already hoisted when this object literal is evaluated.
  module.exports = { crawlData };

  const uuidV1 = require('uuid/v1');
  const mongoose = require('mongoose');
  const axios = require('axios');
  const querystring = require('querystring');

  // Handles to the mongoose models used below, looked up by name in this exact order.
  // NOTE(review): the schemas are presumably registered elsewhere before this module
  // is loaded — verify against the application bootstrap.
  const [
    priceInfoLibModel,
    priceInfoClassModel,
    priceInfoAreaModel,
    priceInfoItemModel,
    priceInfoSourceModel,
    gljLibModel,
    gljClassModel,
  ] = [
    'std_price_info_lib',
    'std_price_info_class',
    'std_price_info_areas',
    'std_price_info_items',
    'std_price_info_source',
    'std_glj_lib_map',
    'std_glj_lib_gljClass',
  ].map(modelName => mongoose.model(modelName));
  // Regions collected from the provider's web page: https://gd.zjtcn.com/gov/c_cs_d_t_p1.html
  // Declared as a city -> counties table and flattened into the list of
  // { city, county } pairs (same entries, same order) that the crawler iterates.
  const areas = [
    ['广州市', ['广州市', '花都区', '增城市', '从化市']],
    ['韶关市', ['韶关市', '始兴县', '翁源县', '新丰县', '乐昌市', '南雄市']],
    ['深圳市', ['深圳市']],
    ['珠海市', ['珠海市']],
    ['汕头市', ['汕头市', '濠江区', '潮阳区', '潮南区', '澄海区', '南澳县']],
    ['佛山市', ['佛山市', '南海区', '顺德区']],
    ['江门市', ['江门市', '新会区', '台山市', '开平市', '鹤山市', '恩平市']],
    ['湛江市', ['湛江市', '遂溪县', '徐闻县', '廉江市', '雷州市', '吴川市']],
    ['茂名市', ['茂名市', '电白市', '高州市', '化州市', '信宜市']],
    ['肇庆市', ['肇庆市', '鼎湖区', '广宁县', '怀集县', '封开县', '德庆县', '高要市', '四会市']],
    ['惠州市', ['惠州市', '惠阳区', '大亚湾开发区', '博罗县', '惠东县', '龙门县']],
    ['梅州市', ['梅州市', '梅县', '大埔县', '丰顺县', '平远县']],
    ['汕尾市', ['汕尾市', '海丰县', '陆河县']],
    ['河源市', ['河源市', '紫金县', '龙川县', '连平县', '和平县', '东源县']],
    ['阳江市', ['阳江市', '海陵岛区', '阳西县', '阳春市']],
    ['清远市', ['清远市', '佛冈县', '阳山县', '连山县', '连南县', '英德市', '连州市']],
    ['东莞市', ['东莞市']],
    ['中山市', ['中山市']],
    ['潮州市', ['潮州市', '潮安县', '饶平县']],
    ['揭阳市', ['揭阳市', '揭西县', '惠来县', '普宁市']],
    ['云浮市', ['云浮市', '新兴县', '郁南县', '罗定市']],
  ].reduce(
    (list, [city, counties]) => list.concat(counties.map(county => ({ city, county }))),
    []
  );
  // Request timeout in milliseconds; also quoted in the timeout message of the response interceptor.
  const TIME_OUT = 120000;

  // Shared configuration for every request sent to the price-information API.
  // To capture traffic with Fiddler while debugging, add
  //   proxy: { host: '127.0.0.1', port: '8888' }
  // (Fiddler must be running, otherwise requests fail with a connect error).
  // NOTE(review): baseURL is plain HTTP and the service key is posted to it —
  // confirm whether the provider offers an HTTPS endpoint.
  const axiosConfig = {
    baseURL: 'http://api.zjtcn.com/user',
    timeout: TIME_OUT,
    headers: {
      'Cache-Control': 'max-age=0',
      'Content-Type': 'application/x-www-form-urlencoded',
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36',
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
      'Accept-Encoding': 'gzip, deflate',
      'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7,zh-CN;q=0.6',
    },
  };

  // The axios instance used by all API calls in this module.
  // (The debug console.log calls that dumped the configuration on every module
  // load have been removed.)
  const axiosInstance = axios.create(axiosConfig);
  // Response interceptor:
  //  - on success, resolve with the response payload (`response.data`) only;
  //  - on failure, turn timeout errors into a user-facing message string and pass
  //    every other rejection through unchanged.
  axiosInstance.interceptors.response.use(
    response => response.data,
    error => {
      // A rejection is not guaranteed to carry a string `message`; reading it
      // defensively keeps this handler from throwing and hiding the real error.
      const message = error && typeof error.message === 'string' ? error.message : '';
      if (message.includes('timeout')) {
        return Promise.reject(`目标网络超时,请稍后再试。(${TIME_OUT}ms)`);
      }
      return Promise.reject(error);
    }
  );
  /**
   * POST form-encoded data through the shared axios instance.
   * The leftover debug dump of the whole axios instance on '/dyn_code' has been removed.
   * @param {String} url - path passed to axiosInstance.post
   * @param {Object} data - key/value pairs, serialized with querystring.stringify
   * @return {Promise<*>} whatever axiosInstance.post resolves with
   */
  async function post(url, data) {
    return axiosInstance.post(url, querystring.stringify(data));
  }
  // Service credentials obtained by purchasing the provider's service (valid for one year).
  // They can be overridden through the environment; the literals remain as fallbacks so
  // existing deployments keep working.
  // NOTE(review): secrets should not live in source control — move these to configuration
  // and rotate the key when possible.
  const SERVICE_ID = process.env.ZJTCN_SERVICE_ID || '2020090003';
  const SERVICE_KEY = process.env.ZJTCN_SERVICE_KEY || '97F2A441633F10DFEB5BFC29B3862847';

  /**
   * Obtain the token required by the journal / price endpoints.
   * Three sequential calls: '/dyn_code' (dynamic code) -> '/aes' (encrypted secret)
   * -> '/authentication' (token).
   * @return {Promise<String>} the token
   * @throws {String} a message containing the response's retCode when any step fails
   *   (strings are thrown, as before, so existing callers keep receiving the same type)
   */
  async function getToken() {
    // Read retCode without crashing when the response is null/undefined.
    const retCodeOf = rsp => (rsp == null ? undefined : rsp.retCode);

    // Step 1: dynamic code
    const dynData = await post('/dyn_code', { service_id: SERVICE_ID });
    if (!dynData || !dynData.response_code) {
      throw `错误代号${retCodeOf(dynData)} 获取动态码失败。`;
    }

    // Step 2: encrypted secret — a plain string on success, anything else is a failure
    const serviceSecret = await post('/aes', {
      service_id: SERVICE_ID,
      service_key: SERVICE_KEY,
      service_code: dynData.response_code,
    });
    if (typeof serviceSecret !== 'string') {
      throw `错误代号${retCodeOf(serviceSecret)} 获取加密字符串错误。`;
    }

    // Step 3: exchange the secret for a token
    const tokenData = await post('/authentication', { service_id: SERVICE_ID, service_secret: serviceSecret });
    if (!tokenData || !tokenData.token) {
      throw `错误代号${retCodeOf(tokenData)} 获取token失败。`;
    }
    return tokenData.token;
  }
  // Zero-padded month labels keyed by month number (1-12).
  const monthMap = {
    '1': '01月',
    '2': '02月',
    '3': '03月',
    '4': '04月',
    '5': '05月',
    '6': '06月',
    '7': '07月',
    '8': '08月',
    '9': '09月',
    '10': '10月',
    '11': '11月',
    '12': '12月',
  };

  /**
   * Expand an inclusive period range into period labels.
   * @param {String} from - first period, "<year>-<month>", eg: 2020-01
   * @param {String} to - last period, "<year>-<month>", eg: 2020-05
   * @return {Array<String>|null} labels such as "2020年-01月" in chronological order, or
   *   null when either bound is malformed, a month is outside 1-12, or from is later than to
   */
  function getPeriodData(from, to) {
    const reg = /(\d+)-(\d+)/;
    const fromMatch = String(from).match(reg);
    const toMatch = String(to).match(reg);
    if (!fromMatch || !toMatch) {
      return null;
    }
    const fromYear = +fromMatch[1];
    const fromMonth = +fromMatch[2];
    const toYear = +toMatch[1];
    const toMonth = +toMatch[2];
    if (!monthMap[fromMonth] || !monthMap[toMonth]) {
      return null;
    }
    // Compare numerically: a plain string comparison mis-orders unpadded months ("2020-9" vs "2020-10").
    if (fromYear > toYear || (fromYear === toYear && fromMonth > toMonth)) {
      return null;
    }
    const periods = [];
    let curYear = fromYear;
    let curMonth = fromMonth;
    // The month bound only applies in the final year; requiring curMonth <= toMonth in
    // every year would drop or truncate ranges that cross a year boundary.
    while (curYear < toYear || (curYear === toYear && curMonth <= toMonth)) {
      periods.push(`${curYear}年-${monthMap[curMonth]}`);
      if (curMonth === 12) {
        curYear++;
        curMonth = 1;
      } else {
        curMonth++;
      }
    }
    return periods;
  }
  /**
   * Pick the `date` value to send to the price endpoint for a period, based on the
   * journals that have been published.
   * Candidates are tried from finest to coarsest granularity:
   *   monthly   "<period>-05"
   *   quarterly "<year>-03-15" / "-06-15" / "-09-15" / "-12-15"
   *   half-year "<year>-06-25" (months up to 6) or "<year>-12-25" (months after 6)
   *   full-year "<year>-12-30"
   * @param {Array<{date: String}>} journalList - published journals
   * @param {String} period - "<year>-<month>", eg: 2020-09
   * @return {String|null} the matching journal date, or null when nothing matches
   */
  function getDateForApi(journalList, period) {
    const [year, month] = period.split('-');
    const monthNo = Number(month);
    const published = new Set(journalList.map(dateItem => dateItem.date));

    const candidates = [`${period}-05`];
    if (Number.isInteger(monthNo) && monthNo >= 1 && monthNo <= 12) {
      // The quarterly journal is dated on the 15th of the quarter's last month.
      const quarterEndMonth = String(Math.ceil(monthNo / 3) * 3).padStart(2, '0');
      candidates.push(`${year}-${quarterEndMonth}-15`);
    }
    if (monthNo / 6 <= 1) {
      candidates.push(`${year}-06-25`);
    } else if (monthNo / 6 > 1) {
      candidates.push(`${year}-12-25`);
    }
    candidates.push(`${year}-12-30`);

    // Always hand back the date string; the full-year branch used to return the
    // whole journal object, which then ended up as the `date` request parameter.
    const match = candidates.find(date => published.has(date));
    return match === undefined ? null : match;
  }
  /**
   * Fetch the raw price rows for one period and one area.
   * Areas publish their journals at different times, so "no data" situations are not
   * thrown: they are returned as a hint string that the caller collects and reports.
   * @param {String} token - token from getToken
   * @param {String} period - "<year>-<month>", eg: 2020-09
   * @param {String} city
   * @param {String} county
   * @return {Promise<Array<Object>|String>} mapped source rows, or a hint string when the
   *   journal list or the price data is unavailable for this period/area
   */
  async function getPriceInfoSource(token, period, city, county) {
    const province = '广东';
    const area = `${province}-${city}-${county}`;
    const industry = 1;
    const body = {
      token,
      province,
      city,
      county,
      industry,
    };
    // Build the hint without dereferencing a missing response (the original read
    // `.retCode` on the very branch that handles a falsy response).
    const describeFailure = rsp => {
      const { retCode, msg } = rsp || {};
      return `retCode: ${retCode} ${msg} (${period} ${city} ${county})`;
    };

    // The journal list is queried per year
    const year = period.split('-')[0];
    const journalRst = await post('/gov/journal_list', { ...body, date: year });
    if (!journalRst || !journalRst.results) {
      return describeFailure(journalRst);
    }
    const date = getDateForApi(journalRst.results, period);
    if (!date) {
      return `retCode: 1000 暂无数据 (${period} ${city} ${county})`;
    }
    const sourceData = await post('/gov/get', { ...body, date });
    if (!sourceData || !sourceData.results) {
      return describeFailure(sourceData);
    }
    // The API limits how often an area may be requested. The rows are only mapped here;
    // persisting them through priceInfoSourceModel.insertMany is currently disabled.
    return sourceData.results.map(item => ({
      period,
      area,
      industry,
      subcid: item.subcid,
      code: item.code,
      name: item.name,
      unit: item.unit,
      price: item.price,
      taxPrice: item.tax_price,
      noTaxPrice: item.no_tax_price,
      specs: item.spec,
      remark: item.notes,
    }));
  }
  /**
   * Build the lookup from a four-digit class code to the full class name.
   * Class names that do not start with four digits are left out.
   * @param {String} compilationID - used to find the labour/material/machine library mapping
   * @return {Promise<Object|null>} code -> name map, or null when no library is mapped
   */
  async function getClassNameMap(compilationID) {
    const libQuery = { compilationId: compilationID };
    const mappedLib = await gljLibModel.findOne(libQuery).lean();
    if (!mappedLib) {
      return null;
    }
    const classQuery = { repositoryId: mappedLib.ID };
    const classes = await gljClassModel.find(classQuery).lean();
    const leadingCode = /^\d{4}/;
    const nameByCode = {};
    for (const entry of classes) {
      const className = entry.Name || '';
      const hit = className.match(leadingCode);
      if (hit) {
        nameByCode[hit[0]] = className;
      }
    }
    return nameByCode;
  }
  /**
   * Convert the raw source rows of one period/area into class and price documents and store them.
   * Steps: find or create the library for the period; skip when price items already exist for
   * this period/area (a period may be completed over several imports because areas publish at
   * different times); clear stale class rows; build classes and price items; insert both.
   * @param {String} compilationID - compilation (fee quota) ID
   * @param {String} period - period label, eg: 2020年-09月
   * @param {String} areaID - area ID
   * @param {Array} sourceData - rows as returned by getPriceInfoSource
   * @param {Object} classNameMap - class code -> class name map from getClassNameMap
   * @return {Promise<void>}
   */
  async function saveData(compilationID, period, areaID, sourceData, classNameMap) {
    let lib = await priceInfoLibModel.findOne({ compilationID, period }).lean();
    if (!lib) {
      lib = {
        compilationID,
        period,
        ID: uuidV1(),
        name: `信息价(${period})`,
        createDate: Date.now(),
      };
      await priceInfoLibModel.insertMany([lib]);
    }
    const libID = lib.ID;

    // Existing price items mean this area was already imported for the period: do nothing,
    // so repeated imports never duplicate data.
    // NOTE(review): Model.count / Model.remove are deprecated in newer mongoose releases;
    // kept because the project's mongoose version is not visible from this file.
    const existCount = await priceInfoItemModel.count({ compilationID, period, areaID });
    if (existCount) {
      return;
    }
    // No price items should imply no classes either; remove leftovers if there are any.
    const existClassCount = await priceInfoClassModel.count({ libID, areaID });
    if (existClassCount) {
      await priceInfoClassModel.remove({ libID, areaID });
    }

    // Rows whose subcid has no known class name go to this catch-all class.
    const otherClassName = '其他';
    const curClassMap = {
      [otherClassName]: { libID, areaID, ID: uuidV1(), ParentID: '-1', name: otherClassName },
    };
    const classData = [];
    const priceData = [];
    const splitReg = /([0-9.]+)-([0-9.]+)/;
    let needOtherClass = false;

    // Build one price document from a source row. Works on a local copy of the unit so the
    // caller's row is not modified.
    const toPriceItem = (sourceItem, classID, name, noTaxPrice) => {
      const unit = typeof sourceItem.unit === 'string'
        ? sourceItem.unit.replace(/m³/g, 'm3').replace(/m²/g, 'm2')
        : sourceItem.unit;
      return {
        compilationID,
        period,
        name,
        noTaxPrice,
        classID,
        libID,
        areaID,
        ID: uuidV1(),
        code: sourceItem.subcid,
        unit,
        specs: sourceItem.specs,
        taxPrice: sourceItem.taxPrice,
        remark: sourceItem.remark,
      };
    };

    sourceData.forEach(item => {
      const className = classNameMap[item.subcid] || otherClassName;
      if (className === otherClassName) {
        needOtherClass = true;
      }
      if (!curClassMap[className]) {
        const classItem = { libID, areaID, ID: uuidV1(), ParentID: '-1', NextSiblingID: '-1', name: className };
        curClassMap[className] = classItem;
        classData.push(classItem);
      }
      const classID = curClassMap[className].ID;
      // The price may arrive as a number; normalise to a string before matching.
      const price = String(item.price || '');
      const matchSplitPrice = price.match(splitReg);
      if (matchSplitPrice) {
        // A "min-max" price range becomes two rows: lowest price and highest price.
        const minPrice = matchSplitPrice[1];
        const maxPrice = matchSplitPrice[2];
        priceData.push(toPriceItem(item, classID, `${item.name}-最低价`, minPrice));
        priceData.push(toPriceItem(item, classID, `${item.name}-最高价`, maxPrice));
      } else {
        priceData.push(toPriceItem(item, classID, item.name, item.noTaxPrice));
      }
    });

    // Order the classes by name (names start with the class code) and chain the siblings.
    classData.sort((a, b) => a.name.localeCompare(b.name));
    classData.forEach((classItem, index) => {
      const preClassItem = classData[index - 1];
      if (preClassItem) {
        preClassItem.NextSiblingID = classItem.ID;
      }
    });
    if (needOtherClass) {
      const otherClassItem = curClassMap[otherClassName];
      // The catch-all class points at the first sorted class, as before; when every row fell
      // into the catch-all class there is no such class, so it simply ends the chain
      // (previously this case threw on classData[0]).
      // NOTE(review): this links the catch-all class ahead of the sorted classes although it
      // is stored last — confirm that is the intended display order.
      otherClassItem.NextSiblingID = classData.length ? classData[0].ID : '-1';
      classData.push(otherClassItem);
    }

    await Promise.all([
      priceInfoClassModel.insertMany(classData),
      priceInfoItemModel.insertMany(priceData),
    ]);
  }
  /**
   * Crawl price information for every configured area over a range of periods and store it.
   * Areas/periods without published data do not abort the run; their hints are collected
   * and thrown together once everything else has been processed.
   * @param {String} from - first period, eg: 2020-01
   * @param {String} to - last period, eg: 2020-05
   * @param {String} compilationID - compilation (fee quota) ID
   * @return {Promise<void>}
   * @throws {String} for an invalid period range, missing class data, or the joined hints
   *   (one per line) of the areas/periods that had no data
   */
  async function crawlData(from, to, compilationID) {
    // Validate the inputs first so that a bad request does not spend a call on the
    // rate-limited token endpoints.
    const periods = getPeriodData(from, to);
    if (!periods) {
      throw '无效的期数区间。';
    }
    const classNameMap = await getClassNameMap(compilationID);
    if (!classNameMap) {
      throw '无有效的分类数据。';
    }
    const token = await getToken();

    // Find or create the area document once per run: the result does not depend on the period.
    const areaItemCache = new Map();
    const resolveArea = async (index) => {
      if (areaItemCache.has(index)) {
        return areaItemCache.get(index);
      }
      const { city, county } = areas[index];
      const name = `${city}-${county}`;
      const serialNo = index + 1;
      let areaItem = await priceInfoAreaModel.findOne({ compilationID, name }).lean();
      if (!areaItem) {
        areaItem = { compilationID, serialNo, ID: uuidV1(), name };
        await priceInfoAreaModel.insertMany([areaItem]);
      } else {
        // Requirement change: areas must be ordered, so refresh the serial number.
        await priceInfoAreaModel.update({ ID: areaItem.ID }, { $set: { serialNo } });
      }
      areaItemCache.set(index, areaItem);
      return areaItem;
    };

    const hintInfos = [];
    for (const period of periods) {
      // "2020年-09月" -> "2020-09", the form passed to getPriceInfoSource
      const sourcePeriod = period.replace(/年|月/g, '');
      for (let i = 0; i < areas.length; i++) {
        const { city, county } = areas[i];
        const areaItem = await resolveArea(i);
        // Skip areas that already hold data for this period (avoids a needless API request).
        const existCount = await priceInfoItemModel.count({ compilationID, period, areaID: areaItem.ID });
        if (existCount) {
          continue;
        }
        const sourceData = await getPriceInfoSource(token, sourcePeriod, city, county);
        if (typeof sourceData === 'string') {
          // A string is a "no data" hint, not a failure
          hintInfos.push(sourceData);
          continue;
        }
        if (!sourceData.length) {
          continue;
        }
        await saveData(compilationID, period, areaItem.ID, sourceData, classNameMap);
      }
    }
    if (hintInfos.length) {
      throw hintInfos.join('\n');
    }
  }
  483. }