Browse Source

feat: 重庆2018信息价爬虫

vian 4 years ago
parent
commit
829260ddd8

+ 1 - 0
modules/price_info_lib/facade/index.js

@@ -69,6 +69,7 @@ async function crawlDataByCompilation(compilationID, from, to) {
         const crawler = require(`../../../web/over_write/crawler/${crawlURL}`);
         crawlData = crawler.crawlData;
     } catch (e) {
+        console.log(e);
         throw '该费用定额无可用爬虫方法。'
     }
     //await crawlData(from, to);

+ 104 - 32
web/over_write/crawler/chongqing_2018_price_crawler.js

@@ -6,7 +6,7 @@
  */
 
 module.exports = {
-  crawlData,  
+  crawlData,
 };
 
 const axios = require('axios');
@@ -75,7 +75,6 @@ const defaultAreas = [
   '潼南区',
   '荣昌区1',
   '荣昌区2',
-  '武隆区',
   '武隆区1',
   '武隆区2',
   '武隆区3',
@@ -94,7 +93,8 @@ const subAreaMap = {
     '大足区',
     '綦江区',
     '南川区',
-    '荣昌区',
+    '荣昌区1',
+    '荣昌区2',
     '铜梁区',
     '璧山区',
     '潼南区',
@@ -116,27 +116,41 @@ const subAreaMap = {
   ],
   '渝东南区': [
     '黔江区',
-    '武隆区',
+    '武隆区1',
+    '武隆区2',
+    '武隆区3',
+    '武隆区4',
+    '武隆区5',
+    '武隆区6',
     '石柱县',
-    '彭水县',
+    '彭水县1',
+    '彭水县2',
+    '彭水县3',
     '酉阳县',
     '秀山县',
   ],
 }
 
+const TIME_OUT = 60000;
+
 // 创建axios实例
 const axiosInstance = axios.create({
-  baseURL: 'http://www.cqsgczjxx.org/Service/MaterialPriceQuerySvr.svrx/',
+  baseURL: 'http://www.cqsgczjxx.org/',
   timeout: TIME_OUT,
+/*   proxy: {
+    host: "127.0.0.1", port: "8888" // Fiddler抓包,需要打开Fiddler否则会报connect error
+  }, */
   headers: {
       'Cache-Control': 'max-age=0',
-      'Content-Type': 'application/x-www-form-urlencoded',
-      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36',
-      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
+      'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
+      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36',
+      'Accept': 'application/json, text/javascript, */*; q=0.01',
+      'X-Requested-With': 'XMLHttpRequest',
       'Accept-Encoding': 'gzip, deflate',
       'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7,zh-CN;q=0.6',
+      // 'Cookie': 'ASP.NET_SessionId=uozdrp0hep5x344vq153muju'
   },
-  responseType: 'document'
+  // responseType: 'json'
 });
 
 // 响应拦截器
@@ -175,10 +189,52 @@ function month2quarter(period) {
   return period;
 }
 
+/**
+ * Promise-based delay: waits `time` milliseconds, runs the optional callback, then resolves.
+ * Despite the name, nothing here blocks; callers are expected to `await` the result.
+ *
+ * @param {Function|null} handle - Optional callback invoked when the timer fires; non-functions are ignored.
+ * @param {number} time - Delay in milliseconds handed to `setTimeout`.
+ * @returns {Promise<void>} Resolves (with no value) after the callback has run.
+ */
+function setTimeoutSync(handle, time) {
+  const runHandle = () => {
+    if (handle && typeof handle === 'function') {
+      handle();
+    }
+  };
+  return new Promise((resolve) => {
+    setTimeout(() => {
+      runHandle();
+      resolve();
+    }, time);
+  });
+}
+
+// The target site has anti-crawler protection: requests must carry a cookie (session ID),
+// otherwise the server answers with a 500 error.
+let curCookie = '';
+
+/**
+ * Requests the site's index page and extracts a cookie from its `set-cookie` response header.
+ *
+ * @returns {Promise<string>} The `name=value` part of the first cookie, or a hard-coded
+ *   session cookie when the response carries no `set-cookie` header.
+ */
+async function getCookie() {
+  // NOTE(review): hard-coded fallback session id — presumably stale; confirm the site still accepts it.
+  const FALLBACK_COOKIE = 'ASP.NET_SessionId=cbwzceh5pxzim13gyesho5af';
+  // axios' signature is get(url, config); the config must be the second argument to take effect.
+  const indexRes = await axiosInstance.get('/Pages/CQZJW/index.aspx', { responseType: 'document' });
+  const cookies = indexRes.headers['set-cookie'];
+  // The header may be an array of cookies or a single string; only the first cookie is used.
+  const firstCookie = Array.isArray(cookies) ? cookies[0] : cookies;
+  // Keep only `name=value`; attributes such as Path/HttpOnly do not belong in a Cookie request header.
+  return firstCookie ? String(firstCookie).split(';')[0] : FALLBACK_COOKIE;
+}
+
+/**
+ * Sends a form-encoded POST to the material price query service with the session cookie attached.
+ * When the response body is an HTML page instead of the expected data, the request is re-sent
+ * after a short pause, up to a fixed number of times.
+ *
+ * @param {string} url - Path appended to `/Service/MaterialPriceQuerySvr.svrx` (e.g. `/QueryKind`).
+ * @param {Object} body - Form fields; serialized with `querystring.stringify`.
+ * @returns {Promise<Object>} The last axios response received. If every attempt returned HTML,
+ *   the request details are logged and that HTML response is returned as-is.
+ */
+async function post(url, body) {
+  // Bound the retries so a site that keeps answering with HTML cannot stall the crawl forever.
+  const MAX_RETRIES = 10;
+  const RETRY_DELAY = 500;
+  const htmlReg = /<!doctype html>/;
+  const isHtmlResponse = (res) => !!res && typeof res.data === 'string' && htmlReg.test(res.data);
+
+  // Fetch the session cookie once and reuse it for all later requests.
+  const cookie = curCookie ? curCookie : await getCookie();
+  curCookie = cookie;
+  const extendConfig = { headers: { Cookie: cookie } };
+  const serviceUrl = `/Service/MaterialPriceQuerySvr.svrx${url}`;
+  const payload = querystring.stringify(body);
+  let res = await axiosInstance.post(serviceUrl, payload, extendConfig);
+  // Per the original author's note the request sometimes comes back as a 302 and has to be re-sent;
+  // that case is detected here by an HTML document in the response body.
+  for (let retry = 0; retry < MAX_RETRIES && isHtmlResponse(res); retry++) {
+    await setTimeoutSync(null, RETRY_DELAY);
+    res = await axiosInstance.post(serviceUrl, payload, extendConfig);
+  }
+  // Still HTML after all retries: dump the request and response for diagnosis.
+  if (isHtmlResponse(res)) {
+    console.log(serviceUrl);
+    console.log(body);
+    console.log(res.data);
+    console.log('==================================');
+  }
+  return res;
+}
+
 // 获取材料价格
 async function queryPrice(period, area, groupType, classify) {
   const body = {
-    period,
+    period: period.replace('-', ''),
     area: area || '',
     groupType: groupType || '',
     classify: classify || '',
@@ -189,22 +245,33 @@ async function queryPrice(period, area, groupType, classify) {
     option: 0,
     token: ''
   };
-  const res = await axiosInstance.post('/QueryInfoPrice', querystring.stringify(body));
-  return res && res.Data && res.Data._Items || [];
+  const res =  await post('/QueryInfoPrice', body);
+  return res && res.data && res.data.Data && res.data.Data._Items || [];
 }
 
 // 获取地区信息
 async function queryArea(period, groupType) {
   const body = {
     groupType,
-    period,
+    period: period.replace('-', ''),
     token: ''
   };
-  const res = await axiosInstance.post('/QueryArea', querystring.stringify(body));
-  const areaData = res && res.Data && res.Data._Items || [];
+  const res =  await post('/QueryArea', body);
+  const areaData = res && res.data && res.data.Data && res.data.Data._Items || [];
   return areaData.map(item => item.Area);
 }
 
+/**
+ * Fetches the classification (kind) data for a price group and period.
+ *
+ * @param {string} groupType - Price group name sent to the service.
+ * @param {string} period - Period string; its first `-` is removed before sending.
+ * @returns {Promise<Array>} The `Data` field of the response body, or an empty array when
+ *   the response, its body, or `Data` is missing.
+ */
+async function queryKind(groupType, period) {
+  const payload = { groupType, period: period.replace('-', ''), token: '' };
+  const response = await post('/QueryKind', payload);
+  if (!response || !response.data) {
+    return [];
+  }
+  return response.data.Data || [];
+}
+
 // 爬取人工价格
 async function crawlLabour(period) {
   const groupType = '人工信息价';
@@ -213,13 +280,14 @@ async function crawlLabour(period) {
   const rst = [];
   for (const rootArea of areas) {
     const priceItems = await queryPrice(quater, rootArea, groupType);
+    priceItems.forEach(item => item.Unit = '工日');
     const subAreas = subAreaMap[rootArea];
     if (subAreas) {
       subAreas.forEach(area => {
         rst.push({ area, data: [{ classify: '人工', priceItems }] });
       });
     } else {
-      rst.push({ rootArea, data: [{ classify: '人工', priceItems }] });
+      rst.push({ area: rootArea, data: [{ classify: '人工', priceItems }] });
     }
   }
   return rst;
@@ -255,8 +323,8 @@ async function crawlBetonMaterial(period) {
 async function crawlBuldingMaterial(period) {
   const groupType = '建筑工程材料价格';
   // 根据期数获取建安工程材料分类
-  const { Data: kinds } = await axiosInstance.post('/QueryKind', querystring.stringify({ groupType, period, token: '' }));
-  if (kinds || !kinds.length) {
+  const kinds = await queryKind(groupType, period);
+  if (!kinds || !kinds.length) {
     return [];
   }
   const rst = [];
@@ -306,8 +374,8 @@ async function crawlGardenMateiral(period) {
       const isDuplicate = duplicateReg.test(item.TaxPrice) || duplicateReg.test(item.NoTaxPrice);
       if (isDuplicate) {
         // 分成最高低价最高价数据
-        const taxPriceList = item.TaxPrice.split('-');
-        const noTaxPriceList = item.NoTaxPrice.split('-');
+        const taxPriceList = item.TaxPrice ? item.TaxPrice.split('-') : [''];
+        const noTaxPriceList = item.NoTaxPrice ? item.NoTaxPrice.split('-') : [''];
         const minItem = {
           ...item,
           Name: `${item.Name}-最低价`,
@@ -333,8 +401,8 @@ async function crawlGardenMateiral(period) {
 async function crawlEnergyMateiral(period) {
   const groupType = '绿色、节能建筑材料价格';
   // 获取分类
-  const { Data: kinds } = await axiosInstance.post('/QueryKind', querystring.stringify({ groupType, period, token: '' }));
-  if (kinds || !kinds.length) {
+  const kinds = await queryKind(groupType, period);
+  if (!kinds || !kinds.length) {
     return [];
   }
   const rootClass = { classify: '绿色、节能建筑工程材料', subClass: [] };
@@ -351,8 +419,8 @@ async function crawlPrefabricatedMateiral(period) {
   const groupType = '装配式建筑材料价格';
   // 获取分类
   const quater = month2quarter(period);
-  const { Data: kinds } = await axiosInstance.post('/QueryKind', querystring.stringify({ groupType, period: quater, token: '' }));
-  if (kinds || !kinds.length) {
+  const kinds = await queryKind(groupType, quater);
+  if (!kinds || !kinds.length) {
     return [];
   }
   const rootClass = { classify: '装配式建筑工程成品构件', subClass: [] };
@@ -369,8 +437,8 @@ async function crawlTrackMateiral(period) {
   const groupType = '轨道材料价格';
   // 获取分类
   const quater = month2quarter(period);
-  const { Data: kinds } = await axiosInstance.post('/QueryKind', querystring.stringify({ groupType, period: quater, token: '' }));
-  if (kinds || !kinds.length) {
+  const kinds = await queryKind(groupType, quater);
+  if (!kinds || !kinds.length) {
     return [];
   }
   const rootClass = { classify: '城市轨道交通工程材料', subClass: [] };
@@ -484,16 +552,16 @@ async function save(allData, period, compilationID) {
   allData.forEach(({ area, data }) => {
     (areaMap[area] || (areaMap[area] = [])).push(...data);
   });
-  const libData = { period, compilationID, ID: v1(), name: `信息价${period}`, createDate: Date.now() };
+  const libData = { period, compilationID, ID: v1(), name: `信息价(${period})`, createDate: Date.now() };
   const curAreas = await priceInfoAreaModel.find({ compilationID }).sort({ serialNo: 1 }).lean();
-  const maxSerialNo = curAreas.length ? curAreas[curAreas.length - 1].serialNo || 0 : 0;
+  let maxSerialNo = curAreas.length ? curAreas[curAreas.length - 1].serialNo || 0 : 0;
   const areaData = [];
   const classData = [];
   const priceData = [];
   for (const area in areaMap) {
     let curArea = curAreas.find(cArea => cArea.name === area);
     if (!curArea) {
-      curArea = { compilationID, ID: v1(), serialNo: ++maxSerialNo };
+      curArea = { compilationID, ID: v1(), serialNo: ++maxSerialNo, name: area };
       areaData.push(curArea);
     }
     const data = areaMap[area];
@@ -505,7 +573,9 @@ async function save(allData, period, compilationID) {
       if (pre) {
         pre.classObj.NextSiblingID = item.classObj.ID;
       }
-      priceData.push(...transformPriceItems(period, compilationID, libData.ID, curArea.ID, item.classObj.ID, item.priceItems));
+      if (item.priceItems) {
+        priceData.push(...transformPriceItems(period, compilationID, libData.ID, curArea.ID, item.classObj.ID, item.priceItems));
+      }
       if (item.subClass && item.subClass.length) {
         item.subClass.forEach((child, cIndex) => {
           child.classObj = { libID: libData.ID, areaID: curArea.ID, ID: v1(), ParentID: item.classObj.ID, NextSiblingID: '-1', name: child.classify };
@@ -514,7 +584,9 @@ async function save(allData, period, compilationID) {
           if (preChild) {
             preChild.classObj.NextSiblingID = child.classObj.ID;
           }
-          priceData.push(...transformPriceItems(period, compilationID, libData.ID, curArea.ID, child.classObj.ID, child.priceItems));
+          if (child.priceItems) {
+            priceData.push(...transformPriceItems(period, compilationID, libData.ID, curArea.ID, child.classObj.ID, child.priceItems));
+          }
         });
       }
     });