Scraping Beijing Used-Car Listings from 汽车之家 (Autohome)

In testing, the site's anti-scraping defenses turned out to be weak: spoofing the request headers and throttling the request rate is enough. Beyond page 100, however, a login is required, and scraping while logged in calls for caution, since one careless run can get the account permanently banned. The scraped data can be stored in any number of formats; below it is saved to a MySQL database.
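As context for the "spoofed headers plus throttled rate" claim above, here is a minimal sketch of that setup. The random jitter and the shared Session are my additions, not part of the original script (which uses fixed headers and a fixed time.sleep(0.95) between pages, as shown further down).

import random
import time

import requests

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36",
    "Referer": "https://www.che168.com/beijing/list/",
}

def polite_get(session, url, min_delay=0.8, max_delay=1.5):
    """Fetch a page with browser-like headers, then pause a random interval."""
    response = session.get(url, headers=HEADERS, timeout=10)
    time.sleep(random.uniform(min_delay, max_delay))
    return response

# usage: session = requests.Session(); html = polite_get(session, page_url).text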
Code walkthrough:

[Figure: annotated code walkthrough illustration]
Full source code:
Scraper main program
# Autohome (汽车之家) scraper: Beijing used-car listings
import time

import requests
from lxml import etree

from data_save import *  # provides savdFile() and saveMysql(), shown below


class Car_second():
    """Plain container for one used-car listing."""
    name = ''
    gonglishu = ''      # mileage ("公里数")
    brought_year = ''   # year of purchase
    location = ''
    img_url = ''
    price = ''


def getInfors(url, i):
    print("Page %d is saving." % i)
    # Build browser-like request headers to get past the (weak) anti-scraping checks
    headers = {
        "Cache-Control": "no-cache",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36",
        "Referer": "https://www.che168.com/beijing/list/",
    }
    response = requests.get(url=url, headers=headers)
    html = response.text
    ob_xml = etree.HTML(html)
    # Every listing is an <a> inside #viewlist_ul; skip the "adv-img" ad items
    infos = ob_xml.xpath('//*[@id="viewlist_ul"]//li[not(contains(@class,"adv-img"))]/a')
    secondCars = []
    for info in infos:
        # Lazy-loaded thumbnails keep the real URL in @src2; fall back to @src
        if info.xpath('.//img/@src2') == []:
            img = info.xpath('.//img/@src')[0]
        else:
            img = info.xpath('.//img/@src2')[0]
        name = info.xpath('.//h4/text()')[0]
        price = info.xpath('.//span[@class="price"]/text()')[0] + info.xpath('.//em/text()')[0]
        # The detail line reads "mileage / year / location", separated by "/"
        myl = info.xpath('.//p/text()')[0].split('/')
        gonglishu = myl[0]
        brought_year = myl[1]
        location = myl[2]
        secondCar = Car_second()
        secondCar.name = name
        secondCar.img_url = img
        secondCar.brought_year = brought_year
        secondCar.location = location
        secondCar.gonglishu = gonglishu
        secondCar.price = price
        secondCars.append(secondCar)
    return secondCars


if __name__ == '__main__':
    # csp{} in the path is the page number; pages beyond 100 require a login
    url = 'https://www.che168.com/beijing/a0_0msdgscncgpi1ltocsp{}exx0/'
    for i in range(1, 101):
        car_infors = getInfors(url.format(i), i)
        time.sleep(0.95)  # throttle between pages
        # savdFile(car_infors)  # alternative: save to a text file
        saveMysql(car_infors)
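One detail in getInfors worth isolating is the image fallback: the list page appears to lazy-load thumbnails, keeping the real URL in a src2 attribute while src holds a placeholder, which is why src2 is checked first. Below is a self-contained check of that logic against an invented HTML fragment; the markup and URL are illustrative only, not real che168 output.

from lxml import etree

# Invented fragment mimicking one listing; the real che168 markup is richer
fragment = '''
<li><a>
  <img src="placeholder.gif" src2="https://example.com/real-car.jpg"/>
  <h4>Sample listing</h4>
</a></li>
'''
node = etree.HTML(fragment).xpath('//a')[0]
src2 = node.xpath('.//img/@src2')
img = src2[0] if src2 else node.xpath('.//img/@src')[0]
print(img)  # -> https://example.com/real-car.jpg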
Saving the data
# data_save.py: persist the scraped records (imported by the main program)

def savdFile(datas):
    # Save to a plain text file
    with open(r'J:\DATAs\北京市二手车(汽车之家)\data.txt', 'a+', encoding='utf-8') as f:
        for data in datas:
            name = data.name
            gonglishu = data.gonglishu
            brought_year = data.brought_year
            location = data.location
            img_url = data.img_url
            price = data.price
            # "图片地址" = "image URL"
            writeCont = name + "/" + gonglishu + "/" + brought_year + "/" + location + "\n" + price + "图片地址:" + img_url
            f.write(writeCont + '\n\n')
    print('保存完成。')  # "done saving"


# Save the data to a MySQL database via SQLAlchemy
from sqlalchemy import Column, create_engine, Integer, String
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

Base = declarative_base()


class Car(Base):
    __tablename__ = "second_cars"
    id = Column(Integer, primary_key=True, autoincrement=True, nullable=False)
    carName = Column(String(100))
    gonglishu = Column(String(20))
    brought_year = Column(String(10))
    location = Column(String(10))
    image_url = Column(String(200))
    price = Column(String(10))


def saveMysql(datas):
    connect = create_engine("mysql+pymysql://root:root@127.0.0.1:3306/second_cars", encoding='utf-8', echo=True)
    Base.metadata.create_all(connect)  # create the table if it does not exist
    DBsession = sessionmaker(bind=connect)
    session = DBsession()
    for data in datas:
        car = Car(
            carName=data.name,
            gonglishu=data.gonglishu,
            brought_year=data.brought_year,
            price=data.price,
            location=data.location,
            image_url=data.img_url,
        )
        session.add(car)
    session.commit()
    session.close()
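A compatibility note in case the code above is run on a newer stack: SQLAlchemy 1.4 moved declarative_base into sqlalchemy.orm, and 2.0 removed the encoding argument from create_engine (the character set is passed through the connection URL instead). A minimal sketch of the equivalent engine setup under those versions, assuming the same root:root credentials and second_cars database:

from sqlalchemy import create_engine
from sqlalchemy.orm import declarative_base, sessionmaker

Base = declarative_base()
# charset reaches the pymysql driver through the URL query string
engine = create_engine(
    "mysql+pymysql://root:root@127.0.0.1:3306/second_cars?charset=utf8mb4",
    echo=True,
)
Session = sessionmaker(bind=engine)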
Reflections
When saving to MySQL, creating a new Car object and passing every field as a keyword argument feels clumsy. I remember there being a much simpler, cleaner way to do this, but I can't recall it now; pointers welcome.
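One candidate for the simpler pattern asked about above (an assumption on my part, not something from the original code): have getInfors collect each listing as a plain dict keyed by the Car column names, then unpack it into the model with **. A sketch under that assumption:

# Hypothetical variant: getInfors appends dicts keyed by Car's column names
record = {
    "carName": name,
    "gonglishu": gonglishu,
    "brought_year": brought_year,
    "location": location,
    "image_url": img,
    "price": price,
}
secondCars.append(record)

# saveMysql's loop then collapses to a single ** unpack per record:
for record in datas:
    session.add(Car(**record))
session.commit()

This removes the Car_second container entirely; the dict doubles as the keyword-argument list for the ORM constructor.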