爬了杭州的租房数据,原来……( 三 )

<= 1:print('------parse, no data in ', meta['province_name'], meta['city_name'])else:for a_tag in a_tag_list:meta['area_name'] = self.common_util.get_extract(a_tag.xpath('text()'))meta['area_url'] = self.common_util.get_extract(a_tag.xpath('@href'))meta['base_url'] = urlif meta['area_name'] is not None and meta['area_name'] != '' and meta['area_name'] != '不限':print(url + meta['area_url'])yield scrapy.FormRequest(url=url + meta['area_url'], method='GET', meta=meta, callback=self.parse_area)def parse_area(self, response):try:body = response.body.decode('gb18030').encode('utf-8')except UnicodeDecodeError as e:print(e)body = response.bodymeta = response.metaurl = response.urlhxf = Selector(text=body)dl_tag_list = hxf.xpath('//div[@class="d897-8936-3fa8-2e8e houseList"]/dl')print('dl_tag_list len: ', len(dl_tag_list))if dl_tag_list is None or len(dl_tag_list) <= 1:print('------parse_area, no data in ', meta['province_name'], meta['city_name'], meta['area_name'])else:for dl_tag in dl_tag_list:feature = ''feature_span_list = dl_tag.xpath('dd/p[5]/span')for feature_span in feature_span_list:feature += self.common_util.get_extract(feature_span.xpath('text()')) + ','feature = feature[:-1] if len(feature) > 1 else featureself.persistent_data.append({'city_index_id': meta['city_index_id'],'province_name': meta['province_name'],'city_name': meta['city_name'],'area_name': meta['area_name'],'detail_url': self.common_util.get_extract(dl_tag.xpath('dd/p[1]/a/@href')),'name': self.common_util.get_extract(dl_tag.xpath('dd/p[1]/a/text()')),'rent_way': self.common_util.get_extract(dl_tag.xpath('dd/p[2]/text()[1]')),'door_model': self.common_util.get_extract(dl_tag.xpath('dd/p[2]/text()[2]')),'area': self.common_util.get_extract(dl_tag.xpath('dd/p[2]/text()[3]')),'toward': self.common_util.get_extract(dl_tag.xpath('dd/p[2]/text()[4]')),'unit_price': self.common_util.get_extract(dl_tag.xpath('dd//span[@class="8936-3fa8-2e8e-6957 price"]/text()')),'feature': feature})# 下一页page_a_list = hxf.xpath('//div[@class="3fa8-2e8e-6957-b8eb fanye"]/a')if len(page_a_list) > 0:for page_a in page_a_list:if self.common_util.get_extract(page_a.xpath('text()')) == '下一页':yield scrapy.FormRequest(url=meta['base_url'] + self.common_util.get_extract(page_a.xpath('@href')), method='GET', meta=meta, callback=self.parse_area)self.save()def save(self):if len(self.persistent_data) > self.save_threshold:try:self.dao.customizable_add_ignore_batch(self.main_table, self.persistent_data)except AttributeError as e:self.dao = DaoUtils()self.dao.customizable_add_ignore_batch(self.main_table, self.persistent_data)print('save except:', e)finally:self.persistent_data = http://www.kingceram.com/post/list()def save_final(self):if len(self.persistent_data)> 0:try:self.dao.customizable_add_ignore_batch(self.main_table, self.persistent_data)except AttributeError as e:self.dao = DaoUtils()self.dao.customizable_add_ignore_batch(self.main_table, self.persistent_data)print('save_final except:', e)finally:self.persistent_data = http://www.kingceram.com/post/list()
最后是做统计的sql和代码:
SELECTarea_name,count(*) AS cFROM sou_fang_rentingGROUP BY area_nameORDER BY c DESC;
"""算算你再杭州的租房成本"""from thor_crawl.utils.db.daoUtil import DaoUtilsfrom thor_crawl.utils.db.mysql.mySQLConfig import MySQLConfigclass RentInHz:def __init__(self, *args, **kwargs):# ============ 工具 ============self.dao = DaoUtils(**{'dbType': 'MySQL', 'config': MySQLConfig.localhost()})def calc(self):hz_data = self.dao.get_all('SELECT area_name, area, unit_price FROM sou_fang_renting')temp = dict()for row in hz_data:if row['area_name'] in temp:temp[row['area_name']].append(row)else:temp[row['area_name']] = list()temp[row['area_name']].append(row)result = list()for x, y in temp.items():total = 0num = 0for row in y:try:# print(float(row['unit_price']))# print(float(str(row['area']).replace('㎡', '')))total += float(row['unit_price']) / float(str(row['area']).replace('㎡', ''))num += 1except ValueError as e:print(e, x, row)result.append({'城市': x, '平均数': total / num})print(result)def feature(self):hz_data = self.dao.get_all('SELECT feature FROM sou_fang_renting')feature_list = list()for row in hz_data:if row['feature'] is not None and row['feature'] != '':for x in str(row['feature']).split(","):feature_list.append(x)temp = dict()for row in feature_list:if row in temp:temp[row] = temp[row] + 1else:temp[row] = 1print(temp)if __name__ == '__main__':tj = RentInHz()tj.feature()