创建一个scrapy项目,命令如下
scrapy startproject dangdang

进入项目选择一个模版创建爬虫,命令如下
scrapy genspider -t basic dd dangdang.com

运行scrapy命令如下
scrapy crawl dd --nolog(加上 --nolog 参数则不输出日志)
dd.py
import scrapy
from scrapy.http import Request

from dangdang.items import DangdangItem  # item model shared with pipelines

# scrapy startproject dangdang          create the project
# scrapy genspider -t basic dd dangdang.com   create the spider
# scrapy crawl dd --nolog               run the spider


class DdSpider(scrapy.Spider):
    """Scrape title / link / comment-count lists from a dangdang.com
    category listing, following pages 1 through 80.
    """

    name = 'dd'
    allowed_domains = ['dangdang.com']
    start_urls = ['http://category.dangdang.com/pg1-cid4008154.html']

    def parse(self, response):
        """Extract one item per listing page, then queue the remaining pages.

        NOTE(review): the original body first returned a login FormRequest,
        which made everything below it unreachable and referenced the
        undefined ``self.header`` and ``response.meta["cookiejar"]`` (the
        start_requests() that would have set the cookiejar was commented
        out). That dead login-demo code has been removed so the extraction
        actually runs.
        """
        item = DangdangItem()
        # .extract() returns the list of ALL matches on the page, so each
        # field holds a parallel list for the whole page.
        item["title"] = response.xpath('//a[@class="pic"]/@title').extract()
        item["link"] = response.xpath('//a[@dd_name="单品图片"]/@href').extract()
        item["comment"] = response.xpath('//a[@dd_name="单品评论"]/text()').extract()
        yield item  # hand this page's data to the enabled pipelines

        # Queue pages 2-80 of the same category; each response comes back
        # through this same callback.
        for page in range(2, 81):
            url = 'http://category.dangdang.com/pg' + str(page) + '-cid4008154.html'
            yield Request(url, callback=self.parse)
items.py
# -*- coding: utf-8 -*-
"""Item models for the dangdang project.

See https://docs.scrapy.org/en/latest/topics/items.html
"""
import scrapy


class DangdangItem(scrapy.Item):
    """One category page's worth of scraped data."""

    title = scrapy.Field()    # book titles
    link = scrapy.Field()     # product-detail URLs
    comment = scrapy.Field()  # comment-link texts (review counts)
pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


class DangdangPipeline(object):
    """Append each scraped record to a plain-text file.

    Disabled by default — must be enabled via ITEM_PIPELINES in settings.py.
    """

    def process_item(self, item, spider):
        """Write one ``title:link:comment`` line per record, then return the item.

        Fixes over the original: the output file is opened once per item
        instead of once per record, closed reliably via a context manager,
        and the three parallel lists are paired with zip() instead of index
        arithmetic (which also avoids an IndexError when the lists differ
        in length).
        """
        with open("/Users/new/lean/test/a.txt", 'a+') as fh:
            for title, link, comment in zip(item["title"], item["link"], item["comment"]):
                fh.write(str(title) + ":" + str(link) + ":" + str(comment) + '\n')
        return item
记住这个网站了
最好再详细点
我也是学计算机的
给你点赞