I am trying to use the Scrapy framework to extract some information from LinkedIn. This is my spider for crawling a LinkedIn profile. When I run this code, the response received in check_login_response has status code 999, so I can't log in to LinkedIn.
How can I solve this problem?
================= Code =====================
import scrapy
import requests
import re
from scrapy.http import Request, FormRequest
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors.sgml import SgmlLinkExtractor
from scrapy.linkextractors import LinkExtractor
from scrapy.conf import settings
class LinkedinProfileItem(scrapy.Item):
    """Scrapy item declaring the fields of one LinkedIn profile record."""

    # Fields are declared one per attribute; none carries extra metadata.
    Profile_Image_Link = scrapy.Field()
    Name = scrapy.Field()
    Profile_Summary = scrapy.Field()
    Headline = scrapy.Field()
    Location = scrapy.Field()
    Industry = scrapy.Field()
class LinkedinProfilesSpider(CrawlSpider):
    """Crawl spider that tries to open a LinkedIn profile with a session cookie.

    Request flow:

    1. ``start_requests`` reads the credentials from the Scrapy settings keys
       ``myemail`` and ``password`` and requests ``login_url``.
    2. ``login`` extracts a CSRF token from the ``lnkd-track-error`` meta tag
       and requests the first start URL with that token sent as the
       ``JSESSIONID`` cookie.
    3. ``check_login_response`` looks for the configured e-mail address in the
       returned page and, when it is present, schedules every start URL.
    4. Links matched by ``rules`` are handed to ``parse_item``.
    """

    name = 'linkedin_scrapy'
    allowed_domains = ['www.linkedin.com']
    login_page = 'https://www.linkedin.com/uas/login'
    login_url = "https://www.linkedin.com"
    start_urls = ['https://www.linkedin.com/in/luca-candela-a83738143']

    # Let responses with status 999 reach the callbacks so they can be
    # inspected and reported instead of being dropped as HTTP errors.
    handle_httpstatus_list = [999]

    # Per-spider settings. This replaces the former
    # ``settings.overrides[...]`` assignment, which depended on the removed
    # ``scrapy.conf`` API and mutated global settings at class-definition time.
    custom_settings = {
        'ROBOTSTXT_OBEY': False,
    }

    # Browser-like headers sent with the profile request issued by ``login``.
    headers = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        'upgrade-insecure-requests': '1',
        "accept-encoding": "gzip, deflate, sdch, br",
        "accept-language": "en-US,en;q=0.8,ms;q=0.6",
        "X-Requested-With": "XMLHttpRequest",
        'referer': 'https://www.linkedin.com',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    }

    # NOTE(review): as a plain class attribute this is presumably not read by
    # Scrapy; if the user-agent middleware really has to be disabled it should
    # live in ``custom_settings`` -- confirm before moving it.
    DOWNLOADER_MIDDLEWARES = {'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None, }

    rules = (
        Rule(
            LinkExtractor(allow=[r'.*']),
            callback='parse_item',
            follow=True,
        ),
    )

    def start_requests(self):
        """Load the credentials from settings and request the login URL.

        Yields:
            A single ``Request`` for ``login_url`` handled by ``login``.
        """
        self.username = self.settings['myemail']
        self.password = self.settings['password']
        yield Request(
            url=self.login_url,
            callback=self.login,
            dont_filter=True,
        )

    def login(self, response):
        """Extract the CSRF token and request the profile page with it.

        Args:
            response: Response for ``login_url``.

        Returns:
            A ``Request`` for the first start URL carrying the token as the
            ``JSESSIONID`` cookie, or ``None`` when no token could be found.
        """
        token = response.xpath("//meta[@name='lnkd-track-error']/@content").extract()
        try:
            real_token = re.search(r'csrfToken=(.*)', token[0]).group(1)
        except (IndexError, AttributeError):
            # IndexError: the meta tag is missing, so ``token`` is empty.
            # AttributeError: the tag has no ``csrfToken=`` part, so
            # ``re.search`` returned None.
            self.log("Error Token")
            return None

        if not real_token:
            return None

        return Request(
            url=self.start_urls[0],
            headers=self.headers,
            cookies={'JSESSIONID': real_token},
            callback=self.check_login_response,
        )

    def check_login_response(self, response):
        """Decide whether the session is logged in and start the crawl if so.

        Args:
            response: Response for the profile page requested by ``login``.

        Returns:
            A list of ``Request`` objects for ``start_urls`` on success,
            ``None`` when the login check fails.
        """
        if response.status == 999:
            # Status 999 only gets here because of ``handle_httpstatus_list``;
            # report it explicitly rather than as a generic failure.
            self.log("Login failed: server answered with status 999")
            return None

        # ``response.text`` is the decoded body; comparing a str against the
        # raw ``response.body`` bytes raises TypeError on Python 3. A missing
        # ``myemail`` setting leaves ``username`` empty, which also counts as
        # a failed login instead of raising.
        if not self.username or self.username not in response.text:
            self.log("Login failed")
            return None

        self.log("Successfully logged in")
        return [Request(url=url, dont_filter=True) for url in self.start_urls]

    def parse_item(self, response):
        """Return a new, empty ``LinkedinProfileItem`` for a crawled page.

        Args:
            response: Response for a page matched by ``rules``.
        """
        item = LinkedinProfileItem()
        return item