123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649 |
- /**
- * @author vian
- * 重庆材料信息价爬虫
- * 由于headless chrome “puppeteer”占用资源比较大,且材料信息价的数据是ssr的静态内容,因此不需要使用puppeteer。
- * 数据获取使用cheerio(解析html,可用类jquery语法操作生成的数据)
- */
- module.exports = {
- crawlData,
- };
- const axios = require('axios');
- const querystring = require('querystring');
- const v1 = require('uuid/v1');
- const mongoose = require('mongoose');
- const _ = require('lodash');
- const priceInfoLibModel = mongoose.model('std_price_info_lib');
- const priceInfoClassModel = mongoose.model('std_price_info_class');
- const priceInfoItemModel = mongoose.model('std_price_info_items');
- const priceInfoAreaModel = mongoose.model('std_price_info_areas');
// Ordered list of known area names. areaPatch() uses the 1-based position of a
// name in this list as the area's serialNo, so the order must not change.
const defaultAreas = [
    '主城区', '渝中区', '江北区', '沙坪坝区', '南岸区',
    '九龙坡区', '大渡口区', '北碚区', '渝北区', '巴南区',
    '万州区', '涪陵区', '万盛区', '双桥区', '黔江区',
    '长寿区', '江津区', '合川区', '永川区', '南川区',
    '綦江县', '潼南县', '铜梁县', '大足县', '荣昌县',
    '璧山县', '梁平区', '城口县', '丰都县', '垫江县',
    '忠县', '开州区', '云阳县', '奉节县', '巫山县',
    '巫溪县', '石柱县', '秀山县', '酉阳县', '彭水县',
    '大足区', '綦江区', '万盛经开区', '双桥经开区', '铜梁区',
    '璧山区', '荣昌县1', '荣昌县2', '彭水县1', '彭水县2',
    '彭水县3', '潼南区', '荣昌区1', '荣昌区2', '武隆区1',
    '武隆区2', '武隆区3', '武隆区4', '武隆区5', '武隆区6',
];
// Maps a root area name to the list of areas it stands for. crawlLabour() emits
// one entry per listed sub-area whenever the queried area is a key of this map.
const subAreaMap = {
    '渝西区': [
        '涪陵区', '长寿区', '永川区', '江津区', '合川区',
        '大足区', '綦江区', '南川区', '荣昌区1', '荣昌区2',
        '铜梁区', '璧山区', '潼南区', '双桥经开区', '万盛经开区',
    ],
    '渝东北区': [
        '万州区', '开州区', '梁平区', '丰都县', '垫江县', '忠县',
        '云阳县', '奉节县', '巫山县', '巫溪县', '城口县',
    ],
    '渝东南区': [
        '黔江区',
        '武隆区1', '武隆区2', '武隆区3', '武隆区4', '武隆区5', '武隆区6',
        '石柱县',
        '彭水县1', '彭水县2', '彭水县3',
        '酉阳县', '秀山县',
    ],
};
// Timeout in milliseconds applied to every request made through axiosInstance.
const TIME_OUT = 60000;

// Headers sent with every request of the shared instance.
const defaultRequestHeaders = {
    'Cache-Control': 'max-age=0',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36',
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'X-Requested-With': 'XMLHttpRequest',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7,zh-CN;q=0.6',
    // The session cookie is not set here; post() attaches it per request.
};

// Shared axios instance used for every request to the target site.
// To inspect traffic with Fiddler, add `proxy: { host: "127.0.0.1", port: "8888" }`
// (Fiddler must be running, otherwise requests fail with a connect error).
const axiosInstance = axios.create({
    baseURL: 'http://www.cqsgczjxx.org/',
    timeout: TIME_OUT,
    headers: defaultRequestHeaders,
});

// Successful responses are passed through untouched.
function passResponseThrough(response) {
    return response;
}

// Timeout errors are replaced by a readable message string; every other error
// is re-rejected unchanged.
function rejectResponseError(error) {
    const isTimeout = error.message.includes('timeout');
    return Promise.reject(isTimeout ? `目标网络超时,请稍后再试。(${TIME_OUT}ms)` : error);
}

// Response interceptor.
axiosInstance.interceptors.response.use(passResponseThrough, rejectResponseError);
/**
 * Convert a monthly period label into a quarterly one by replacing the
 * zero-padded "MM月" part with "N季度".
 * @param {String} period - monthly period, eg: 2020年-01月
 * @return {String | null} the quarterly label (eg: 2020年-1季度), the input
 *   unchanged when it contains no month part, or null for empty / non-string input
 */
function month2quarter(period) {
    if (typeof period !== 'string' || !period) {
        return null;
    }
    // [month pattern, replacement], checked in this order; the first match wins.
    const quarterRules = [
        [/(01|02|03)月/, '1季度'],
        [/(04|05|06)月/, '2季度'],
        [/(07|08|09)月/, '3季度'],
        [/(10|11|12)月/, '4季度'],
    ];
    const matchedRule = quarterRules.find(([monthPattern]) => monthPattern.test(period));
    if (!matchedRule) {
        return period;
    }
    const [monthPattern, quarterLabel] = matchedRule;
    return period.replace(monthPattern, quarterLabel);
}
/**
 * Promise-based delay: waits `time` milliseconds, runs `handle` if it is a
 * function, then resolves.
 * @param {Function | null} handle - optional callback run when the timer fires
 * @param {Number} time - delay in milliseconds
 * @return {Promise<void>}
 */
function setTimeoutSync(handle, time) {
    return new Promise((resolve) => {
        const onTimeout = () => {
            if (typeof handle === 'function') {
                handle();
            }
            resolve();
        };
        setTimeout(onTimeout, time);
    });
}
// The target site has anti-crawler protection: requests must carry a session
// cookie (session ID), otherwise the server answers with a 500 error.
// Cached cookie, filled lazily by post().
let curCookie = '';

// Session cookie used when the index page does not hand one out.
const FALLBACK_SESSION_COOKIE = 'ASP.NET_SessionId=cbwzceh5pxzim13gyesho5af';

/**
 * Request the index page and extract the session cookie from its
 * `set-cookie` response header.
 * @return {Promise<String>} the first `name=value` cookie pair, or the
 *   fallback cookie when the response carries no usable cookie
 */
async function getCookie() {
    // axios.get takes (url, config): the config must be the second argument,
    // a third argument is silently ignored.
    const indexRes = await axiosInstance.get('/Pages/CQZJW/index.aspx', { responseType: 'document' });
    const rawCookies = indexRes.headers['set-cookie'];
    const firstCookie = Array.isArray(rawCookies) ? rawCookies[0] : rawCookies;
    if (!firstCookie) {
        // Missing header or empty list.
        return FALLBACK_SESSION_COOKIE;
    }
    // Keep only "name=value"; drop attributes such as path / HttpOnly.
    return String(firstCookie).split(';')[0];
}
// Upper bound on re-sent requests, so a site that keeps answering with an HTML
// page cannot hang the crawler forever.
const MAX_POST_RETRIES = 20;
// Pause between two attempts, in milliseconds.
const POST_RETRY_DELAY = 500;
// The service answers with JSON; a string body containing an HTML doctype means
// we were served a page instead (matched case-insensitively).
const HTML_PAGE_REG = /<!doctype html>/i;

// True when the response body is an HTML page rather than service data.
function isHtmlPageResponse(res) {
    return Boolean(res) && typeof res.data === 'string' && HTML_PAGE_REG.test(res.data);
}

/**
 * POST a form-encoded body to the material price query service, attaching the
 * session cookie (fetched once through getCookie() and cached in curCookie).
 * The request is re-sent, at most MAX_POST_RETRIES times, while the site
 * answers with an HTML page instead of data.
 * @param {String} url - service method path, eg: /QueryArea
 * @param {Object} body - form fields
 * @return {Promise<Object>} the axios response
 * @throws {Error} when the site still returns an HTML page after all retries
 */
async function post(url, body) {
    if (!curCookie) {
        curCookie = await getCookie();
    }
    const extendConfig = { headers: { Cookie: curCookie } };
    const serviceUrl = `/Service/MaterialPriceQuerySvr.svrx${url}`;
    const payload = querystring.stringify(body);
    let res = await axiosInstance.post(serviceUrl, payload, extendConfig);
    // Sometimes the request gets a 302 and ends up on a page; send it again.
    for (let retry = 0; retry < MAX_POST_RETRIES && isHtmlPageResponse(res); retry++) {
        await setTimeoutSync(null, POST_RETRY_DELAY);
        res = await axiosInstance.post(serviceUrl, payload, extendConfig);
    }
    if (isHtmlPageResponse(res)) {
        // Dump the failing request before giving up, so the problem can be diagnosed.
        console.log(serviceUrl);
        console.log(body);
        console.log(res.data);
        console.log('==================================');
        // Fail loudly: returning the page would make callers treat it as "no data".
        throw new Error(`目标网站持续返回页面内容而非数据,请稍后再试。(${serviceUrl})`);
    }
    return res;
}
/**
 * Query price items from the service.
 * @param {String} period - period label; its first "-" is removed before sending
 * @param {String} [area] - area name, sent as '' when omitted
 * @param {String} [groupType] - group type, sent as '' when omitted
 * @param {String} [classify] - classification, sent as '' when omitted
 * @return {Promise<Array>} `Data._Items` of the response, or [] when absent
 */
async function queryPrice(period, area, groupType, classify) {
    const res = await post('/QueryInfoPrice', {
        period: period.replace('-', ''),
        area: area || '',
        groupType: groupType || '',
        classify: classify || '',
        priceType: '',
        searchParam: '',
        // A single large page is requested instead of paging through results.
        pageIndex: 1,
        pageSize: 10000,
        option: 0,
        token: '',
    });
    const payload = res && res.data && res.data.Data;
    const items = payload && payload._Items;
    return items || [];
}
/**
 * Query the area names available for a group type and period.
 * @param {String} period - period label; its first "-" is removed before sending
 * @param {String} groupType - group type
 * @return {Promise<Array>} the `Area` field of each entry in `Data._Items`
 *   ([] when the response carries no items)
 */
async function queryArea(period, groupType) {
    const res = await post('/QueryArea', {
        groupType,
        period: period.replace('-', ''),
        token: '',
    });
    const payload = res && res.data && res.data.Data;
    const areaItems = (payload && payload._Items) || [];
    return areaItems.map(({ Area }) => Area);
}
/**
 * Query the classifications available for a group type and period.
 * Note the parameter order: groupType first, then period.
 * @param {String} groupType - group type
 * @param {String} period - period label; its first "-" is removed before sending
 * @return {Promise<Array>} `Data` of the response, or [] when absent
 */
async function queryKind(groupType, period) {
    const res = await post('/QueryKind', {
        groupType,
        period: period.replace('-', ''),
        token: '',
    });
    const kinds = res && res.data && res.data.Data;
    return kinds || [];
}
/**
 * Crawl labour prices. The monthly period is converted to a quarterly one
 * before querying.
 * @param {String} period - monthly period label
 * @return {Promise<Array>} entries shaped `{ area, data: [{ classify, priceItems }] }`
 */
async function crawlLabour(period) {
    const groupType = '人工信息价';
    const quarterPeriod = month2quarter(period);
    const rootAreas = await queryArea(quarterPeriod, groupType);
    const result = [];
    for (const rootArea of rootAreas) {
        const priceItems = await queryPrice(quarterPeriod, rootArea, groupType);
        // Every labour price item gets the same unit.
        for (const priceItem of priceItems) {
            priceItem.Unit = '工日';
        }
        // An area listed in subAreaMap is replaced by its sub-areas, which all
        // share the same price items; any other area is kept as is.
        const targetAreas = subAreaMap[rootArea] || [rootArea];
        for (const area of targetAreas) {
            result.push({ area, data: [{ classify: '人工', priceItems }] });
        }
    }
    return result;
}
/**
 * Crawl local (per-area) material prices, one query per area.
 * @param {String} period - period label
 * @return {Promise<Array>} entries shaped `{ area, data: [{ classify, priceItems }] }`
 */
async function crawlLocaleMaterial(period) {
    const groupType = '区县材料价格';
    const classify = '地方材料信息价';
    const areaNames = await queryArea(period, groupType);
    const result = [];
    for (const area of areaNames) {
        const priceItems = await queryPrice(period, area, groupType);
        result.push({ area, data: [{ classify, priceItems }] });
    }
    return result;
}
/**
 * Crawl ready-mixed mortar prices, one query per area.
 * @param {String} period - period label
 * @return {Promise<Array>} entries shaped `{ area, data: [{ classify, priceItems }] }`
 */
async function crawlBetonMaterial(period) {
    const groupType = '预拌砂浆价格';
    const classify = '预拌商品砂浆';
    const areaNames = await queryArea(period, groupType);
    const result = [];
    for (const area of areaNames) {
        const priceItems = await queryPrice(period, area, groupType);
        result.push({ area, data: [{ classify, priceItems }] });
    }
    return result;
}
/**
 * Crawl building-engineering material prices: the classifications of the
 * period are fetched first, then the prices of each classification.
 * @param {String} period - period label
 * @return {Promise<Array>} entries shaped `{ classify, priceItems, subClass: [] }`,
 *   or [] when the period has no classification
 */
async function crawlBuldingMaterial(period) {
    const groupType = '建筑工程材料价格';
    const kinds = await queryKind(groupType, period);
    const hasKinds = Boolean(kinds && kinds.length);
    if (!hasKinds) {
        return [];
    }
    const result = [];
    for (const classify of kinds) {
        const priceItems = await queryPrice(period, '', groupType, classify);
        result.push({ classify, priceItems, subClass: [] });
    }
    return result;
}
/**
 * Crawl landscaping (garden) material prices and group them by `Family`
 * under a single root class.
 *
 * Example row: height | trunk diameter | crown diameter | branch height | price
 *            =  ''    | 14-17          | 大于400        | 200-300       | 430-780
 * A row whose price is a range is split into two items:
 *   1. name "<Name>-最低价", price 430
 *   2. name "<Name>-最高价", price 780
 * both with the specs "干径14-17CM 冠径大于400CM 分枝高200-300CM".
 *
 * @param {String} period - period label
 * @return {Promise<Array>} `[rootClass]`, or [] when there is no price item
 */
async function crawlGardenMateiral(period) {
    const priceItems = await queryPrice(period, '', '城市园林绿化工程材料价格', '');
    if (!priceItems.length) {
        return [];
    }
    const unit = 'CM';
    // [label, item field] pairs, in the order they appear in the specs text.
    const specFields = [
        ['高度', 'Height'],
        ['干径', 'TrunkDiameter'],
        ['冠径', 'TopDiameter'],
        ['分枝高', 'BranchHeight'],
    ];
    // A price containing "-" is a "min-max" range.
    const isRange = (price) => /-/.test(price);
    const splitRange = (price) => (price ? price.split('-') : ['']);

    const rootClass = { classify: '苗木', subClass: [] };
    const familyGroups = _.groupBy(priceItems, 'Family');
    for (const [family, familyItems] of Object.entries(familyGroups)) {
        const familyClass = { classify: family, priceItems: [], subClass: [] };
        rootClass.subClass.push(familyClass);
        for (const item of familyItems) {
            // Build the specs text from the non-empty dimension fields.
            item.Model = specFields
                .filter(([, field]) => item[field])
                .map(([label, field]) => `${label}${item[field]}${unit}`)
                .join(' ');
            if (!isRange(item.TaxPrice) && !isRange(item.NoTaxPrice)) {
                familyClass.priceItems.push(item);
                continue;
            }
            // Split the row into a lowest-price item and a highest-price item.
            const [minTaxPrice, maxTaxPrice] = splitRange(item.TaxPrice);
            const [minNoTaxPrice, maxNoTaxPrice] = splitRange(item.NoTaxPrice);
            familyClass.priceItems.push(
                {
                    ...item,
                    Name: `${item.Name}-最低价`,
                    TaxPrice: minTaxPrice,
                    NoTaxPrice: minNoTaxPrice,
                },
                {
                    ...item,
                    Name: `${item.Name}-最高价`,
                    TaxPrice: maxTaxPrice || '',
                    NoTaxPrice: maxNoTaxPrice || '',
                }
            );
        }
    }
    return [rootClass];
}
/**
 * Crawl green / energy-saving building material prices; every classification
 * becomes a sub-class of a single root class.
 * @param {String} period - period label
 * @return {Promise<Array>} `[rootClass]`, or [] when the period has no classification
 */
async function crawlEnergyMateiral(period) {
    const groupType = '绿色、节能建筑材料价格';
    const kinds = await queryKind(groupType, period);
    const hasKinds = Boolean(kinds && kinds.length);
    if (!hasKinds) {
        return [];
    }
    const subClass = [];
    for (const classify of kinds) {
        const priceItems = await queryPrice(period, '', groupType, classify);
        subClass.push({ classify, priceItems, subClass: [] });
    }
    return [{ classify: '绿色、节能建筑工程材料', subClass }];
}
/**
 * Crawl prefabricated building component prices. The monthly period is
 * converted to a quarterly one before querying; every classification becomes
 * a sub-class of a single root class.
 * @param {String} period - monthly period label
 * @return {Promise<Array>} `[rootClass]`, or [] when the period has no classification
 */
async function crawlPrefabricatedMateiral(period) {
    const groupType = '装配式建筑材料价格';
    const quarterPeriod = month2quarter(period);
    const kinds = await queryKind(groupType, quarterPeriod);
    const hasKinds = Boolean(kinds && kinds.length);
    if (!hasKinds) {
        return [];
    }
    const subClass = [];
    for (const classify of kinds) {
        const priceItems = await queryPrice(quarterPeriod, '', groupType, classify);
        subClass.push({ classify, priceItems, subClass: [] });
    }
    return [{ classify: '装配式建筑工程成品构件', subClass }];
}
/**
 * Crawl rail-transit material prices. The monthly period is converted to a
 * quarterly one before querying; every classification becomes a sub-class of
 * a single root class.
 * @param {String} period - monthly period label
 * @return {Promise<Array>} `[rootClass]`, or [] when the period has no classification
 */
async function crawlTrackMateiral(period) {
    const groupType = '轨道材料价格';
    const quarterPeriod = month2quarter(period);
    const kinds = await queryKind(groupType, quarterPeriod);
    const hasKinds = Boolean(kinds && kinds.length);
    if (!hasKinds) {
        return [];
    }
    const subClass = [];
    for (const classify of kinds) {
        const priceItems = await queryPrice(quarterPeriod, '', groupType, classify);
        subClass.push({ classify, priceItems, subClass: [] });
    }
    return [{ classify: '城市轨道交通工程材料', subClass }];
}
/**
 * Crawl the main material prices: the results of the five category crawlers
 * are concatenated, in order, under a single area.
 * @param {String} period - period label
 * @return {Promise<Array>} a one-element list `[{ area, data }]`
 */
async function crawlGeneralMaterial(period) {
    // Run one after another, in this order.
    const categoryCrawlers = [
        crawlBuldingMaterial,
        crawlGardenMateiral,
        crawlEnergyMateiral,
        crawlPrefabricatedMateiral,
        crawlTrackMateiral,
    ];
    const data = [];
    for (const crawlCategory of categoryCrawlers) {
        const categoryData = await crawlCategory(period);
        data.push(...categoryData);
    }
    return [{ area: '通用', data }];
}
/**
 * Build the list of monthly period labels between two periods (both inclusive).
 * Ranges that cross a year boundary are supported.
 * @param {String} from - first period, eg: 2020-01
 * @param {String} to - last period, eg: 2020-05
 * @return {Array<string> | null} eg: ['2020年-01月', '2020年-02月'];
 *   null when an argument is not a "year-month" string, a month is outside
 *   1..12, or `from` is later than `to`
 */
function getPeriodData(from, to) {
    // Parse "YYYY-MM" into a running month count (year * 12 + zero-based month),
    // so that two periods can be compared and iterated as plain numbers.
    const toMonthIndex = (text) => {
        const match = typeof text === 'string' ? text.match(/(\d+)-(\d+)/) : null;
        if (!match) {
            return null;
        }
        const year = Number(match[1]);
        const month = Number(match[2]);
        if (month < 1 || month > 12) {
            return null;
        }
        return year * 12 + (month - 1);
    };

    const fromIndex = toMonthIndex(from);
    const toIndex = toMonthIndex(to);
    if (fromIndex === null || toIndex === null || fromIndex > toIndex) {
        return null;
    }

    const list = [];
    for (let monthIndex = fromIndex; monthIndex <= toIndex; monthIndex++) {
        const year = Math.floor(monthIndex / 12);
        // Months are always written with two digits, eg: 01月.
        const month = String((monthIndex % 12) + 1).padStart(2, '0');
        list.push(`${year}年-${month}月`);
    }
    return list;
}
/**
 * Patch for area serial numbers: every area of the compilation whose serialNo
 * is null gets the 1-based position of its name in defaultAreas (0 when the
 * name is not listed).
 * NOTE(review): save() stores area names as `重庆市-<area>` while defaultAreas
 * holds bare names — confirm the records patched here use unprefixed names,
 * otherwise they all receive serialNo 0.
 * @param {String} compilationID - compilation ID
 * @return {Promise<void>}
 */
async function areaPatch(compilationID) {
    const unnumberedAreas = await priceInfoAreaModel.find({ compilationID, serialNo: null }).lean();
    const bulks = unnumberedAreas.map(({ ID, name }) => {
        const serialNo = defaultAreas.indexOf(name) + 1;
        return {
            updateOne: {
                filter: { ID },
                update: { serialNo },
            },
        };
    });
    if (!bulks.length) {
        return;
    }
    await priceInfoAreaModel.bulkWrite(bulks);
}
/**
 * Convert crawled price items into records ready for storage. Each record gets
 * a fresh ID and an empty code.
 * @param {String} period - period label
 * @param {String} compilationID - compilation ID
 * @param {String} libID - ID of the price info library
 * @param {String} areaID - ID of the area
 * @param {String} classID - ID of the class the items belong to
 * @param {Array} items - crawled items (Name, Model, Unit, TaxPrice, NoTaxPrice, Remark)
 * @return {Array} one record per crawled item, in the same order
 */
function transformPriceItems(period, compilationID, libID, areaID, classID, items) {
    const toRecord = ({ Name, Model, Unit, TaxPrice, NoTaxPrice, Remark }) => {
        return {
            period,
            compilationID,
            libID,
            areaID,
            classID,
            ID: v1(),
            code: '',
            name: Name,
            specs: Model,
            unit: Unit,
            taxPrice: TaxPrice,
            noTaxPrice: NoTaxPrice,
            remark: Remark,
        };
    };
    return items.map(item => toRecord(item));
}
/**
 * Convert the crawled raw data of one period and store it: new areas, classes,
 * price items, and finally the library record itself.
 * @param {Array} allData - entries shaped `{ area, data }`, where `data` is a
 *   list of class nodes `{ classify, priceItems, subClass }`
 * @param {String} period - period label
 * @param {String} compilationID - compilation ID
 * @return {Promise<void>}
 */
async function save(allData, period, compilationID) {
    // Merge the data of all parts by area.
    const areaMap = {};
    for (const { area, data } of allData) {
        const areaName = `重庆市-${area}`;
        if (!areaMap[areaName]) {
            areaMap[areaName] = [];
        }
        areaMap[areaName].push(...data);
    }

    const libData = { period, compilationID, ID: v1(), name: `信息价(${period})`, createDate: Date.now() };
    const curAreas = await priceInfoAreaModel.find({ compilationID }).sort({ serialNo: 1 }).lean();
    // Areas are sorted by serialNo, so the last one carries the highest number.
    let maxSerialNo = curAreas.length ? curAreas[curAreas.length - 1].serialNo || 0 : 0;

    const areaData = [];
    const classData = [];
    const priceData = [];

    // Create the class record of `node`, chain it after `previous` (its
    // preceding sibling, if any) and collect the node's price items.
    const registerClass = (node, previous, parentID, areaID) => {
        node.classObj = { libID: libData.ID, areaID, ID: v1(), ParentID: parentID, NextSiblingID: '-1', name: node.classify };
        classData.push(node.classObj);
        if (previous) {
            previous.classObj.NextSiblingID = node.classObj.ID;
        }
        if (node.priceItems) {
            priceData.push(...transformPriceItems(period, compilationID, libData.ID, areaID, node.classObj.ID, node.priceItems));
        }
    };

    for (const areaName of Object.keys(areaMap)) {
        let curArea = curAreas.find(cArea => cArea.name === areaName);
        if (!curArea) {
            // Unknown area: create it with the next serial number.
            curArea = { compilationID, ID: v1(), serialNo: ++maxSerialNo, name: areaName };
            areaData.push(curArea);
        }
        // Classes have at most two levels.
        let previousRoot;
        for (const rootNode of areaMap[areaName]) {
            registerClass(rootNode, previousRoot, '-1', curArea.ID);
            previousRoot = rootNode;
            if (rootNode.subClass && rootNode.subClass.length) {
                let previousChild;
                for (const childNode of rootNode.subClass) {
                    registerClass(childNode, previousChild, rootNode.classObj.ID, curArea.ID);
                    previousChild = childNode;
                }
            }
        }
    }

    if (areaData.length) {
        await priceInfoAreaModel.insertMany(areaData);
    }
    if (classData.length) {
        await priceInfoClassModel.insertMany(classData);
    }
    if (priceData.length) {
        await priceInfoItemModel.insertMany(priceData);
    }
    await priceInfoLibModel.insertMany([libData]);
}
/**
 * Crawl and store the data of every period in a range, one period at a time.
 * @param {String} from - first period, eg: 2020-01
 * @param {String} to - last period, eg: 2020-05
 * @param {String} compilationID - compilation ID
 * @return {Promise<void>} resolves once every period has been saved
 * @throws {String} the error text; when some periods were already saved, the
 *   text ends with the range of periods that were crawled successfully
 */
async function crawlData(from, to, compilationID) {
    // Both are declared outside the try block because the catch block reads them.
    let periods;
    let curPeriod;
    try {
        periods = getPeriodData(from, to);
        if (!periods || !periods.length) {
            throw '无效的期数区间。';
        }
        // Area serialNo patch.
        await areaPatch(compilationID);
        // Crawl period by period.
        for (const period of periods) {
            const labourData = await crawlLabour(period);
            const localeData = await crawlLocaleMaterial(period);
            const betonData = await crawlBetonMaterial(period);
            const generalData = await crawlGeneralMaterial(period);
            const allData = [...labourData, ...localeData, ...betonData, ...generalData];
            if (!allData.length) {
                throw `${period}无有效数据`;
            }
            await save(allData, period, compilationID);
            // Only updated after a successful save: the last fully stored period.
            curPeriod = period;
        }
    } catch (err) {
        console.log(err);
        // On error, report which periods were crawled successfully.
        let errTip = '';
        if (curPeriod) {
            errTip += `\n成功爬取期数为:${periods[0]}到${curPeriod}`;
        }
        const errStr = String(err) + errTip;
        console.log(`err`);
        console.log(errStr);
        throw errStr;
    }
}
|