| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649 | /** * @author vian * 重庆材料信息价爬虫 * 由于headless chrome “puppeteer”占用资源比较大,且材料信息价的数据是ssr的静态内容,因此不需要使用puppeteer。 * 数据获取使用cheerio(解析html,可用类jquery语法操作生成的数据) */module.exports = {  crawlData,};const axios = require('axios');const querystring = require('querystring');const v1 = require('uuid/v1');const mongoose = require('mongoose');const _ = require('lodash');const priceInfoLibModel = mongoose.model('std_price_info_lib');const priceInfoClassModel = mongoose.model('std_price_info_class');const priceInfoItemModel = mongoose.model('std_price_info_items');const priceInfoAreaModel = mongoose.model('std_price_info_areas');const defaultAreas = [  '主城区',  '渝中区',  '江北区',  '沙坪坝区',  '南岸区',  '九龙坡区',  '大渡口区',  '北碚区',  '渝北区',  '巴南区',  '万州区',  '涪陵区',  '万盛区',  '双桥区',  '黔江区',  '长寿区',  '江津区',  '合川区',  '永川区',  '南川区',  '綦江县',  '潼南县',  '铜梁县',  '大足县',  '荣昌县',  '璧山县',  '梁平区',  '城口县',  '丰都县',  '垫江县',  '忠县',  '开州区',  '云阳县',  '奉节县',  '巫山县',  '巫溪县',  '石柱县',  '秀山县',  '酉阳县',  '彭水县',  '大足区',  '綦江区',  '万盛经开区',  '双桥经开区',  '铜梁区',  '璧山区',  '荣昌县1',  '荣昌县2',  '彭水县1',  '彭水县2',  '彭水县3',  '潼南区',  '荣昌区1',  '荣昌区2',  '武隆区1',  '武隆区2',  '武隆区3',  '武隆区4',  '武隆区5',  '武隆区6',];const subAreaMap = {  '渝西区': [    '涪陵区',    '长寿区',    '永川区',    '江津区',    '合川区',    '大足区',    '綦江区',    '南川区',    '荣昌区1',    '荣昌区2',    '铜梁区',    '璧山区',    '潼南区',    '双桥经开区',    '万盛经开区',  ],  '渝东北区': [    '万州区',    '开州区',    '梁平区',    '丰都县',    '垫江县',    '忠县',    '云阳县',    '奉节县',    '巫山县',    '巫溪县',    '城口县',  ],  '渝东南区': [    '黔江区',    '武隆区1',    '武隆区2',    '武隆区3',    '武隆区4',    '武隆区5',    '武隆区6',    '石柱县',    '彭水县1',    '彭水县2',    '彭水县3',    '酉阳县',    '秀山县',  ],}const TIME_OUT = 60000;// 创建axios实例const axiosInstance = axios.create({  baseURL: 'http://www.cqsgczjxx.org/',  timeout: TIME_OUT,  /*   proxy: {      host: "127.0.0.1", port: "8888" // Fiddler抓包,需要打开Fiddler否则会报connect error    }, */  headers: {    'Cache-Control': 'max-age=0',    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36',    'Accept': 'application/json, text/javascript, */*; q=0.01',    'X-Requested-With': 'XMLHttpRequest',    'Accept-Encoding': 'gzip, deflate',    'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7,zh-CN;q=0.6',    // 'Cookie': 'ASP.NET_SessionId=uozdrp0hep5x344vq153muju'  },  // responseType: 'json'});// 响应拦截器axiosInstance.interceptors.response.use(function (response) {  return response;}, function (error) {  // 对响应错误做点什么  if (error.message.includes('timeout')) {    return Promise.reject(`目标网络超时,请稍后再试。(${TIME_OUT}ms)`);  } else {    return Promise.reject(error);  }});// 将月期数转换为季度期数function month2quarter(period) {  if (!period || typeof period !== 'string') {    return null;  }  const quarter1 = /(01|02|03)月/;  const quarter2 = /(04|05|06)月/;  const quarter3 = /(07|08|09)月/;  const quarter4 = /(10|11|12)月/;  if (quarter1.test(period)) {    return period.replace(quarter1, '1季度');  }  if (quarter2.test(period)) {    return period.replace(quarter2, '2季度');  }  if (quarter3.test(period)) {    return period.replace(quarter3, '3季度');  }  if (quarter4.test(period)) {    return period.replace(quarter4, '4季度');  }  return period;}function setTimeoutSync(handle, time) {  return new Promise((resolve, reject) => {    setTimeout(() => {      if (handle && typeof handle === 'function') {        handle();      }      resolve();    }, time);  });}// 目标网站做了反爬虫处理,请求需要携带cookie(sessionID),否则会报500错误let curCookie = '';async function getCookie() {  const indexRes = await axiosInstance.get('/Pages/CQZJW/index.aspx', null, { responseType: 'document' });  const cookies = indexRes.headers['set-cookie'];  return Object.prototype.toString.call(cookies) === '[object Array]'    ? cookies[0].split(';')[0]    : cookies || 'ASP.NET_SessionId=cbwzceh5pxzim13gyesho5af';}async function post(url, body) {  const cookie = curCookie ? curCookie : await getCookie();  curCookie = cookie;  const extendConfig = { headers: { Cookie: cookie } };  const serviceUrl = `/Service/MaterialPriceQuerySvr.svrx${url}`  let res = await axiosInstance.post(serviceUrl, querystring.stringify(body), extendConfig);  while (res && typeof res.data === 'string' && /<!doctype html>/.test(res.data)) {    // 有时候请求会返回302,需要重新发请求    await setTimeoutSync(null, 500);    res = await axiosInstance.post(serviceUrl, querystring.stringify(body), extendConfig);  }  if (typeof res.data === 'string' && /<!doctype html>/.test(res.data)) {    console.log(serviceUrl);    console.log(body);    console.log(res.data);    console.log('==================================')  }  return res;}// 获取材料价格async function queryPrice(period, area, groupType, classify) {  const body = {    period: period.replace('-', ''),    area: area || '',    groupType: groupType || '',    classify: classify || '',    priceType: '',    searchParam: '',    pageIndex: 1,    pageSize: 10000,    option: 0,    token: ''  };  const res = await post('/QueryInfoPrice', body);  return res && res.data && res.data.Data && res.data.Data._Items || [];}// 获取地区信息async function queryArea(period, groupType) {  const body = {    groupType,    period: period.replace('-', ''),    token: ''  };  const res = await post('/QueryArea', body);  const areaData = res && res.data && res.data.Data && res.data.Data._Items || [];  return areaData.map(item => item.Area);}// 获取分类信息async function queryKind(groupType, period) {  const body = {    groupType,    period: period.replace('-', ''),    token: ''  }  const res = await post('/QueryKind', body);  return res && res.data && res.data.Data || [];}// 爬取人工价格async function crawlLabour(period) {  const groupType = '人工信息价';  const quater = month2quarter(period);  const areas = await queryArea(quater, groupType);  const rst = [];  for (const rootArea of areas) {    const priceItems = await queryPrice(quater, rootArea, groupType);    priceItems.forEach(item => item.Unit = '工日');    const subAreas = subAreaMap[rootArea];    if (subAreas) {      subAreas.forEach(area => {        rst.push({ area, data: [{ classify: '人工', priceItems }] });      });    } else {      rst.push({ area: rootArea, data: [{ classify: '人工', priceItems }] });    }  }  return rst;}// 爬取地方材料信息价async function crawlLocaleMaterial(period) {  const groupType = '区县材料价格';  const areas = await queryArea(period, groupType);  const rst = [];  for (const area of areas) {    const priceItems = await queryPrice(period, area, groupType);    const item = { area, data: [{ classify: '地方材料信息价', priceItems }] };    rst.push(item);  }  return rst;}// 爬取预拌砂浆息价async function crawlBetonMaterial(period) {  const groupType = '预拌砂浆价格';  const areas = await queryArea(period, groupType);  const rst = [];  for (const area of areas) {    const priceItems = await queryPrice(period, area, groupType);    const item = { area, data: [{ classify: '预拌商品砂浆', priceItems }] };    rst.push(item);  }  return rst;}// 爬取建安工程材料async function crawlBuldingMaterial(period) {  const groupType = '建筑工程材料价格';  // 根据期数获取建安工程材料分类  const kinds = await queryKind(groupType, period);  if (!kinds || !kinds.length) {    return [];  }  const rst = [];  for (const kind of kinds) {    const priceItems = await queryPrice(period, '', groupType, kind);    rst.push({ classify: kind, priceItems, subClass: [] });  }  return rst;}// 爬取园林绿化工程材料async function crawlGardenMateiral(period) {  const priceItems = await queryPrice(period, '', '城市园林绿化工程材料价格', '');  // 数据 高度(CM) | 干径(CM) | 冠径(CM) | 分枝高(CM) | 不含税价(元) = ‘’ | 14-17 | 大于400 | 200-300 | 430-780  // 则此数据需要分为:  // 1. { name: 名称-最低价, specs: 干径14-17CM 冠径大于400CM 分枝高200-300CM, noTaxPrice: 430 }  // 2. { name: 名称-最高价, specs: 干径14-17CM 冠径大于400CM 分枝高200-300CM, noTaxPrice: 780 }  if (!priceItems.length) {    return [];  }  const rootClass = { classify: '苗木', subClass: [] };  const rst = [rootClass];  const groupedData = _.groupBy(priceItems, 'Family');  const unit = 'CM';  const duplicateReg = /-/;  Object    .entries(groupedData)    .forEach(([kind, items]) => {      const classItem = { classify: kind, priceItems: [], subClass: [] };      rootClass.subClass.push(classItem);      items.forEach(item => {        // 拼接规格型号        const specsList = [];        if (item.Height) {          specsList.push(`高度${item.Height}${unit}`);        }        if (item.TrunkDiameter) {          specsList.push(`干径${item.TrunkDiameter}${unit}`);        }        if (item.TopDiameter) {          specsList.push(`冠径${item.TopDiameter}${unit}`);        }        if (item.BranchHeight) {          specsList.push(`分枝高${item.BranchHeight}${unit}`);        }        item.Model = specsList.join(' ');        const isDuplicate = duplicateReg.test(item.TaxPrice) || duplicateReg.test(item.NoTaxPrice);        if (isDuplicate) {          // 分成最高低价最高价数据          const taxPriceList = item.TaxPrice ? item.TaxPrice.split('-') : [''];          const noTaxPriceList = item.NoTaxPrice ? item.NoTaxPrice.split('-') : [''];          const minItem = {            ...item,            Name: `${item.Name}-最低价`,            TaxPrice: taxPriceList[0],            NoTaxPrice: noTaxPriceList[0]          };          const maxItem = {            ...item,            Name: `${item.Name}-最高价`,            TaxPrice: taxPriceList[1] || '',            NoTaxPrice: noTaxPriceList[1] || ''          };          classItem.priceItems.push(minItem, maxItem);        } else {          classItem.priceItems.push(item);        }      });    });  return rst;}// 爬取绿色、节能建筑工程材料async function crawlEnergyMateiral(period) {  const groupType = '绿色、节能建筑材料价格';  // 获取分类  const kinds = await queryKind(groupType, period);  if (!kinds || !kinds.length) {    return [];  }  const rootClass = { classify: '绿色、节能建筑工程材料', subClass: [] };  const rst = [rootClass];  for (const kind of kinds) {    const priceItems = await queryPrice(period, '', groupType, kind);    rootClass.subClass.push({ classify: kind, priceItems, subClass: [] });  }  return rst;}// 爬取装配式建筑工程成品构件async function crawlPrefabricatedMateiral(period) {  const groupType = '装配式建筑材料价格';  // 获取分类  const quater = month2quarter(period);  const kinds = await queryKind(groupType, quater);  if (!kinds || !kinds.length) {    return [];  }  const rootClass = { classify: '装配式建筑工程成品构件', subClass: [] };  const rst = [rootClass];  for (const kind of kinds) {    const priceItems = await queryPrice(quater, '', groupType, kind);    rootClass.subClass.push({ classify: kind, priceItems, subClass: [] });  }  return rst;}// 爬取轨道材料async function crawlTrackMateiral(period) {  const groupType = '轨道材料价格';  // 获取分类  const quater = month2quarter(period);  const kinds = await queryKind(groupType, quater);  if (!kinds || !kinds.length) {    return [];  }  const rootClass = { classify: '城市轨道交通工程材料', subClass: [] };  const rst = [rootClass];  for (const kind of kinds) {    const priceItems = await queryPrice(quater, '', groupType, kind);    rootClass.subClass.push({ classify: kind, priceItems, subClass: [] });  }  return rst;}// 爬取主要材料信息价async function crawlGeneralMaterial(period) {  const area = '通用';  const buildingMaterial = await crawlBuldingMaterial(period);  const gardenMaterial = await crawlGardenMateiral(period)  const energyMaterial = await crawlEnergyMateiral(period)  const prefMaterial = await crawlPrefabricatedMateiral(period)  const trackMaterial = await crawlTrackMateiral(period);  return [{ area, data: [...buildingMaterial, ...gardenMaterial, ...energyMaterial, ...prefMaterial, ...trackMaterial] }];}/** * 获取期数数据 * @param {String} from - 从哪一期开始 eg: 2020-01 * @param {String} to - 从哪一期结束 eg: 2020-05 * @return {Array<string> | null} ['2020年12月'] */function getPeriodData(from, to) {  if (from > to) {    return null;  }  // 根据区间获取期数列表  const reg = /(\d+)-(\d+)/;  const fromMatch = from.match(reg);  const fromYear = +fromMatch[1];  const fromMonth = +fromMatch[2];  const toMatch = to.match(reg);  const toYear = +toMatch[1];  const toMonth = +toMatch[2];  let curYear = fromYear;  let curMonth = fromMonth;  const list = [];  const monthMap = {    '1': '01月',    '2': '02月',    '3': '03月',    '4': '04月',    '5': '05月',    '6': '06月',    '7': '07月',    '8': '08月',    '9': '09月',    '10': '10月',    '11': '11月',    '12': '12月',  };  while (curYear <= toYear && curMonth <= toMonth) {    list.push(`${curYear}年-${monthMap[curMonth]}`);    if (curMonth === 12) {      curYear++;      curMonth = 1;    } else {      curMonth++;    }  }  return list;}// 地区serialNo补丁async function areaPatch(compilationID) {  const areaData = await priceInfoAreaModel.find({ compilationID, serialNo: null }).lean();  const bulks = [];  areaData.forEach(areaItem => {    const serialNo = defaultAreas.indexOf(areaItem.name) + 1;    bulks.push({      updateOne: {        filter: { ID: areaItem.ID },        update: { serialNo }      }    });  });  if (bulks.length) {    await priceInfoAreaModel.bulkWrite(bulks);  }}function transformPriceItems(period, compilationID, libID, areaID, classID, items) {  return items.map(item => ({    period,    compilationID,    libID,    areaID,    classID,    ID: v1(),    code: '',    name: item.Name,    specs: item.Model,    unit: item.Unit,    taxPrice: item.TaxPrice,    noTaxPrice: item.NoTaxPrice,    remark: item.Remark,  }));}// 转换爬取出来的原始数据并入库async function save(allData, period, compilationID) {  // 将各部分数据按照地区进行合并  const areaMap = {};  allData.forEach(({ area, data }) => {    const areaName = `重庆市-${area}`;    (areaMap[areaName] || (areaMap[areaName] = [])).push(...data);  });  const libData = { period, compilationID, ID: v1(), name: `信息价(${period})`, createDate: Date.now() };  const curAreas = await priceInfoAreaModel.find({ compilationID }).sort({ serialNo: 1 }).lean();  let maxSerialNo = curAreas.length ? curAreas[curAreas.length - 1].serialNo || 0 : 0;  const areaData = [];  const classData = [];  const priceData = [];  for (const area in areaMap) {    let curArea = curAreas.find(cArea => cArea.name === area);    if (!curArea) {      curArea = { compilationID, ID: v1(), serialNo: ++maxSerialNo, name: area };      areaData.push(curArea);    }    const data = areaMap[area];    // class最多只有两层    data.forEach((item, index) => {      item.classObj = { libID: libData.ID, areaID: curArea.ID, ID: v1(), ParentID: '-1', NextSiblingID: '-1', name: item.classify };      classData.push(item.classObj);      const pre = data[index - 1];      if (pre) {        pre.classObj.NextSiblingID = item.classObj.ID;      }      if (item.priceItems) {        priceData.push(...transformPriceItems(period, compilationID, libData.ID, curArea.ID, item.classObj.ID, item.priceItems));      }      if (item.subClass && item.subClass.length) {        item.subClass.forEach((child, cIndex) => {          child.classObj = { libID: libData.ID, areaID: curArea.ID, ID: v1(), ParentID: item.classObj.ID, NextSiblingID: '-1', name: child.classify };          classData.push(child.classObj);          const preChild = item.subClass[cIndex - 1];          if (preChild) {            preChild.classObj.NextSiblingID = child.classObj.ID;          }          if (child.priceItems) {            priceData.push(...transformPriceItems(period, compilationID, libData.ID, curArea.ID, child.classObj.ID, child.priceItems));          }        });      }    });  }  if (areaData.length) {    await priceInfoAreaModel.insertMany(areaData);  }  if (classData.length) {    await priceInfoClassModel.insertMany(classData);  }  if (priceData.length) {    await priceInfoItemModel.insertMany(priceData);  }  await priceInfoLibModel.insertMany([libData]);}/** * 爬取数据 * @param {String} from - 从哪一期开始 eg: 2020-01 * @param {String} to - 从哪一期结束 eg: 2020-05 * @param {String} compilationID - 费用定额ID * @return {Object} */async function crawlData(from, to, compilationID) {  let curPeriod;  try {    const periods = getPeriodData(from, to);    if (!periods || !periods.length) {      throw '无效的期数区间。';    }    // 地区补丁    await areaPatch(compilationID);    // 一期一期爬取数据    for (const period of periods) {      const labourData = await crawlLabour(period);      const localeData = await crawlLocaleMaterial(period);      const betonData = await crawlBetonMaterial(period);      const generalData = await crawlGeneralMaterial(period);      const allData = [...labourData, ...localeData, ...betonData, ...generalData];      if (!allData.length) {        throw `${period}无有效数据`;      }      await save(allData, period, compilationID);      curPeriod = period;    }  } catch (err) {    console.log(err);    // 错误时提示已经成功爬取的期数    let errTip = '';    if (curPeriod) {      errTip += `\n成功爬取期数为:${periods[0]}到${curPeriod}`;    }    const errStr = String(err) + errTip;    console.log(`err`);    console.log(errStr);    throw errStr;  }}
 |