add: Xici proxy spider (西刺代理爬取) · cwjokaka/ok_ip_proxy_pool@c6f5640
cwjokaka committed Sep 8, 2019
1 parent 37f4b89 commit c6f5640
Showing 6 changed files with 99 additions and 8 deletions.
11 changes: 8 additions & 3 deletions setting.py
@@ -1,10 +1,15 @@
-from src.spider.spider_66_ip import Spider66Ip
+# from src.spider.spider_66_ip import Spider66Ip
+# from src.spider.spider_quan_wang_ip import SpiderQuanWangIp
+from src.database.memory_db import *

-DB_TYPE = 'memory'  # memory/redis
+DB = {
+    'db_type': 'memory'  # memory/redis
+}

-SPIDER_LIST = [Spider66Ip]
+# SPIDER_LIST = [Spider66Ip, SpiderQuanWangIp]


 # Spider request headers
 HEADERS = {
     "X-Requested-With": "XMLHttpRequest",
     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
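Note: the flat DB_TYPE constant becomes a DB dict, leaving room for per-backend options later. A minimal sketch of how a factory might consume it — the get_db helper and the error branch are illustrative assumptions, not part of this commit:

from setting import DB
from src.database.memory_db import memory_db_instance


def get_db():
    # hypothetical factory keyed on the new DB setting
    db_type = DB.get('db_type', 'memory')
    if db_type == 'memory':
        return memory_db_instance  # module-level singleton added below
    raise ValueError(f'unsupported db type: {db_type}')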
5 changes: 4 additions & 1 deletion src/database/memory.py → src/database/memory_db.py
@@ -1,7 +1,7 @@
 from src.database.abs_database import AbsDatabase


-class Memory(AbsDatabase):
+class MemoryDB(AbsDatabase):
     """
     Database: in-memory implementation
     """
@@ -16,3 +16,6 @@ def get(self, key):

     def remove(self, key):
         return self._box.pop(key, None)
+
+
+memory_db_instance = MemoryDB()
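The rename to memory_db.py also introduces a module-level singleton, so callers share one store instead of constructing MemoryDB themselves. A minimal usage sketch — the put signature is an assumption, since only get and remove appear in this hunk:

from src.database.memory_db import memory_db_instance

memory_db_instance.put('1.2.3.4:8080', {'source': 'spider_xici'})  # hypothetical put(key, value)
cached = memory_db_instance.get('1.2.3.4:8080')
memory_db_instance.remove('1.2.3.4:8080')  # pops the key; returns None if absent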
5 changes: 4 additions & 1 deletion src/spider/abs_spider.py
@@ -1,4 +1,4 @@
-from typing import List, Iterable
+from typing import List

 from src.entity.proxy_entity import ProxyEntity

@@ -12,7 +12,10 @@ def crawl(self):
         print(f'{self._name} starting crawl...')
         res = self.do_crawl()
         print(f'{self._name} finished crawling, got {len(res)} proxies')
+        # TODO: persist to the database
+
+        return res

     def do_crawl(self) -> List[ProxyEntity]:
         raise RuntimeError('do_crawl is not implemented!')
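With crawl() now returning the scraped list, AbsSpider is a small template method: subclasses override do_crawl and inherit the logging. A minimal subclass sketch — the spider below is illustrative, not part of the repo; the ProxyEntity call mirrors the ones in this commit:

from typing import List

from src.entity.proxy_entity import ProxyEntity
from src.spider.abs_spider import AbsSpider


class SpiderExample(AbsSpider):
    """Hypothetical spider returning one fixed proxy, to show the contract."""

    def __init__(self) -> None:
        super().__init__('example spider')

    def do_crawl(self) -> List[ProxyEntity]:
        return [ProxyEntity('127.0.0.1', '8080', source=self._name)]


proxies = SpiderExample().crawl()  # prints progress, then returns the list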

12 changes: 9 additions & 3 deletions src/spider/spider_66_ip.py
@@ -3,6 +3,7 @@
 import requests

 from src.entity.proxy_entity import ProxyEntity
+from src.enum.common import ProxyCoverEnum
 from src.spider.abs_spider import AbsSpider
 from bs4 import BeautifulSoup

@@ -31,13 +32,18 @@ def do_crawl(self) -> List[ProxyEntity]:
                 ip = contents[0].text
                 port = contents[1].text
                 region = contents[2].text
-                proxy_type = contents[3].text
-                check_time = contents[4].text
+                proxy_cover = contents[3].text
+                # check_time = contents[4].text
                 # print(f'{ip}:{port}/{region}/{proxy_type}/{check_time}')
                 result.append(ProxyEntity(ip, port,
                                           source=self._name,
-                                          proxy_type=proxy_type,
+                                          proxy_cover=self._judge_proxy_cover(proxy_cover),
                                           region=region))
         return result
+
+    def _judge_proxy_cover(self, cover_str: str):
+        if cover_str == '高匿代理':
+            return ProxyCoverEnum.HIGH_COVER
+        else:
+            return ProxyCoverEnum.UNKNOWN
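The 66ip column at index 3 is now read as an anonymity level (proxy_cover) rather than a protocol. A standalone sketch of the same row parsing, using a fabricated HTML fragment in the shape the spider expects — the real page markup is assumed:

from bs4 import BeautifulSoup  # pip install beautifulsoup4 lxml

html = ('<table><tr><td>1.2.3.4</td><td>8080</td>'
        '<td>北京</td><td>高匿代理</td></tr></table>')
contents = BeautifulSoup(html, 'lxml').find('tr').find_all('td')
ip, port, region, proxy_cover = (td.text for td in contents[:4])
print(ip, port, region, proxy_cover)  # -> 1.2.3.4 8080 北京 高匿代理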
60 changes: 60 additions & 0 deletions src/spider/spider_xici_ip.py
@@ -0,0 +1,60 @@
+from typing import List
+
+import requests
+
+from src.entity.proxy_entity import ProxyEntity
+from src.enum.common import ProxyCoverEnum, ProxyTypeEnum
+from src.spider.abs_spider import AbsSpider
+from bs4 import BeautifulSoup
+from setting import HEADERS
+
+
+class SpiderXiciIp(AbsSpider):
+    """
+    Xici proxy spider. Refresh rate: 🐌 slow
+    https://www.xicidaili.com/
+    """
+    def __init__(self) -> None:
+        super().__init__('Xici IP proxy spider')
+        self._base_urls = [
+            'https://www.xicidaili.com/nn',  # high-anonymity proxies
+            'https://www.xicidaili.com/nt'   # transparent proxies
+        ]
+
+    def do_crawl(self) -> List[ProxyEntity]:
+        result = []
+        for base_url in self._base_urls:
+            for page in range(1, 2):
+                res = requests.get(f'{base_url}/{page}', headers=HEADERS)
+                soup = BeautifulSoup(res.text, 'lxml')
+                # drop the header row and the last row of the table
+                tr_list = soup.find('table', attrs={'id': 'ip_list'}).find_all('tr')[1:-1]
+                for tr in tr_list:
+                    tds = tr.find_all('td')
+                    # country = tds[0].find('img')['alt']
+                    ip = tds[1].text
+                    port = tds[2].text
+                    city = tds[3].text
+                    proxy_cover = tds[4].text
+                    proxy_type = tds[5].text
+                    result.append(ProxyEntity(ip, port,
+                                              source=self._name,
+                                              proxy_cover=self._judge_proxy_cover(proxy_cover),
+                                              proxy_type=self._judge_proxy_type(proxy_type),
+                                              ))
+        return result
+
+    def _judge_proxy_cover(self, cover_str: str):
+        if cover_str == '高匿':
+            return ProxyCoverEnum.HIGH_COVER
+        elif cover_str == '透明':
+            return ProxyCoverEnum.TRANSPARENT
+        else:
+            return ProxyCoverEnum.UNKNOWN
+
+    def _judge_proxy_type(self, type_str: str):
+        if type_str == 'HTTPS':
+            return ProxyTypeEnum.HTTPS
+        elif type_str == 'HTTP':
+            return ProxyTypeEnum.HTTP
+        else:
+            return ProxyTypeEnum.UNKNOWN
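To try the new spider from a REPL — this hits the live xicidaili.com site, which may throttle or block automated clients:

from src.spider.spider_xici_ip import SpiderXiciIp

proxies = SpiderXiciIp().crawl()  # logs progress, returns List[ProxyEntity]
print(len(proxies))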
14 changes: 14 additions & 0 deletions test/spider/test_spider_xici_Ip.py
@@ -0,0 +1,14 @@
+import unittest
+
+from src.spider.spider_xici_ip import SpiderXiciIp
+
+
+class TestSpiderXiciIp(unittest.TestCase):
+
+    def setUp(self) -> None:
+        self._spider = SpiderXiciIp()
+
+    def test_crawl(self):
+        result = self._spider.crawl()
+        assert result
+        assert len(result) > 0
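The test exercises the live site, so it needs network access. It can be run from the repository root with python -m unittest test.spider.test_spider_xici_Ip, or programmatically — a minimal sketch:

import unittest

# load and run just this module's tests, mirroring the CLI invocation above
suite = unittest.defaultTestLoader.loadTestsFromName('test.spider.test_spider_xici_Ip')
unittest.TextTestRunner(verbosity=2).run(suite)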
