Add: anonymity validation (cwjokaka/ok_ip_proxy_pool@dd9f22f)
cwjokaka committed Sep 28, 2019
1 parent b4e5c06 commit dd9f22f
Showing 9 changed files with 185 additions and 50 deletions.
44 changes: 2 additions & 42 deletions main.py
@@ -1,44 +1,4 @@
-import asyncio
-import typing
-
-from apscheduler.schedulers.background import BackgroundScheduler
-
-from src.database.sqlite_opt import sqlite_opt
-from src.entity.proxy_entity import ProxyEntity
-from src.log.logger import logger
-from src.spider.spiders import spider_collection
-from setting import WEB_SERVER, VALIDATOR, SPIDER
-from src.validator.validator import validator
-from src.web.web_flask import app
-
-
-def crawl():
-    proxies = []
-    tasks = []
-    for spider_name in SPIDER['list']:
-        tasks.append(spider_collection[spider_name].crawl())
-        # proxies.extend(spider_collection[spider_name].crawl())
-    loop = asyncio.new_event_loop()
-    asyncio.set_event_loop(loop)
-    results = loop.run_until_complete(asyncio.gather(*tasks))
-    loop.close()
-    for proxies_list in results:
-        proxies.extend(proxies_list)
-    # proxies = loop.run_until_complete(asyncio.gather(*tasks))
-    # persist
-    save(proxies)
-
-
-def save(proxies: typing.List[ProxyEntity]):
-    for proxy in proxies:
-        sqlite_opt.add_proxy(proxy)
-
+from src.runner import run

 if __name__ == '__main__':
-    logger.info('Initializing sqlite database...')
-    sqlite_opt.init_db()
-    scheduler = BackgroundScheduler()
-    scheduler.add_job(crawl, 'interval', seconds=SPIDER['crawl_interval'])
-    scheduler.add_job(validator.run, 'interval', seconds=VALIDATOR['validate_interval'])
-    scheduler.start()
-    app.run(host=WEB_SERVER['host'], port=WEB_SERVER['port'])
+    run()
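With the startup logic moved into src/runner.py, main.py shrinks to a thin entry point. Reconstructed from the hunk above (2 additions, 42 deletions), the whole file now reads:

from src.runner import run

if __name__ == '__main__':
    run()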
14 changes: 11 additions & 3 deletions setting.py
@@ -1,6 +1,6 @@
 # Proxy spider settings
 SPIDER = {
-    'crawl_interval': 60,  # interval between proxy-IP crawls (seconds)
+    'crawl_interval': 120,  # interval between proxy-IP crawls (seconds)
     'list': [  # proxy spiders to use (class names)
         'Spider66Ip',
         'SpiderQuanWangIp',
@@ -14,9 +14,17 @@

 # Validator settings
 VALIDATOR = {
-    'test_url': 'http://www.baidu.com',
+    'test_url': 'http://www.baidu.com',  # availability-check URL
     'request_timeout': 4,  # validation timeout (seconds)
-    'validate_interval': 30
+    'validate_interval': 60  # validation interval (seconds)
 }
+
+# Anonymity-check settings
+ANONYMITY_VALIDATOR = {
+    'http_test_url': 'http://httpbin.org/get',  # anonymity-check URL
+    'https_test_url': 'https://httpbin.org/get',
+    'request_timeout': 4,  # max request timeout (seconds)
+    'interval': 180  # check interval (seconds)
+}

 # Database settings
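Both test URLs point at httpbin, which echoes the request back as JSON; the anonymity check keys off two fields of that echo. An illustrative response (the field values are made up, the field names are httpbin's):

# Typical http://httpbin.org/get echo (illustrative):
{
    "args": {},
    "headers": {"Host": "httpbin.org", "Proxy-Connection": "keep-alive"},
    "origin": "1.2.3.4, 5.6.7.8",
    "url": "http://httpbin.org/get"
}
# two different IPs in "origin" mean the client address leaked through the proxy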
6 changes: 6 additions & 0 deletions src/database/abs_database.py
@@ -6,12 +6,18 @@ def add_proxy(self, proxy):
     def get_all_proxies(self):
         raise NotImplementedError

+    def get_unknown_anonymity_proxies(self):
+        raise NotImplementedError
+
     def increase_reliability(self, url):
         raise NotImplementedError

     def reduce_reliability(self, url):
         raise NotImplementedError

+    def update_anonymity(self, url, value):
+        raise NotImplementedError
+
     def remove(self, key):
         raise NotImplementedError
30 changes: 29 additions & 1 deletion src/database/sqlite_opt.py
@@ -5,6 +5,7 @@
 from sqlalchemy import create_engine, desc
 from sqlalchemy.orm import sessionmaker
 from src.entity.proxy_entity import ProxyEntity
+from src.enum.common import ProxyCoverEnum
 from src.log.logger import logger
 import sqlite3
@@ -35,7 +36,19 @@ def get_all_proxies(self):
             return session.query(ProxyEntity).all()
         except Exception as e:
             logger.exception(e)
-            pass
+        finally:
+            session.close()
+        return []
+
+    def get_unknown_anonymity_proxies(self):
+        session = self._DBSession()
+        try:
+            return (session.query(ProxyEntity)
+                    .filter(ProxyEntity.reliability > 0)
+                    .filter(ProxyEntity.proxy_cover == ProxyCoverEnum.UNKNOWN.value)
+                    .all())
+        except Exception as e:
+            logger.exception(e)
+        finally:
+            session.close()
+        return []
@@ -79,6 +92,21 @@ def reduce_reliability(self, url):
     def remove(self, key):
         return super().remove(key)

+    def update_anonymity(self, url, value):
+        conn = self._get_connect()
+        cursor = conn.cursor()
+        try:
+            cursor.execute(f"""
+                UPDATE {DB["table_name"]} SET proxy_cover = {value}
+                WHERE url='{url}'
+            """)
+            conn.commit()
+        except Exception as e:
+            logger.exception(e)
+        finally:
+            cursor.close()
+            conn.close()
+
     def init_db(self):
         conn = self._get_connect()
         cursor = conn.cursor()
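The UPDATE above interpolates value and url straight into the SQL text. A parameterized variant (a sketch, not part of this commit; the table name still has to be formatted in, since placeholders only bind values) avoids quoting and injection issues:

    def update_anonymity(self, url, value):
        conn = self._get_connect()
        cursor = conn.cursor()
        try:
            # ? placeholders let sqlite3 escape both values safely
            cursor.execute(f'UPDATE {DB["table_name"]} SET proxy_cover = ? WHERE url = ?',
                           (value, url))
            conn.commit()
        except Exception as e:
            logger.exception(e)
        finally:
            cursor.close()
            conn.close()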
3 changes: 2 additions & 1 deletion src/entity/proxy_entity.py
@@ -1,11 +1,12 @@
 from src.enum.common import ProxyTypeEnum, ProxyCoverEnum
 from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy import Column, Integer, String
+from setting import DB
 Base = declarative_base()


 class ProxyEntity(Base):
-    __tablename__ = 'proxy'
+    __tablename__ = DB['table_name']
     url = Column(String(36), primary_key=True)
     # ip = Column(String(20))
     # port = Column(String(5))
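Reading __tablename__ from setting.DB keeps the ORM mapping and the raw SQL in sqlite_opt on a single table-name source. The DB block itself is collapsed out of the setting.py hunk above; its assumed shape (hypothetical reconstruction; only DB['table_name'] is actually referenced in this commit):

# setting.py, DB section (assumed)
DB = {
    'table_name': 'proxy',  # matches the previously hard-coded __tablename__
    # ...
}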
46 changes: 46 additions & 0 deletions src/runner.py
@@ -0,0 +1,46 @@
import asyncio
import typing

from apscheduler.schedulers.background import BackgroundScheduler

from src.database.sqlite_opt import sqlite_opt
from src.entity.proxy_entity import ProxyEntity
from src.log.logger import logger
from src.spider.spiders import spider_collection
from setting import WEB_SERVER, VALIDATOR, SPIDER, ANONYMITY_VALIDATOR
from src.validator.validator import validator
from src.validator.anonymity_validator import anonymity_validator
from src.web.web_flask import app


def crawl():
    proxies = []
    tasks = []
    for spider_name in SPIDER['list']:
        tasks.append(spider_collection[spider_name].crawl())
        # proxies.extend(spider_collection[spider_name].crawl())
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    results = loop.run_until_complete(asyncio.gather(*tasks))
    loop.close()
    for proxies_list in results:
        proxies.extend(proxies_list)
    # proxies = loop.run_until_complete(asyncio.gather(*tasks))
    # persist
    save(proxies)


def save(proxies: typing.List[ProxyEntity]):
    for proxy in proxies:
        sqlite_opt.add_proxy(proxy)


def run():
    logger.info('Initializing sqlite database...')
    sqlite_opt.init_db()
    scheduler = BackgroundScheduler()
    scheduler.add_job(crawl, 'interval', seconds=SPIDER['crawl_interval'])
    scheduler.add_job(validator.run, 'interval', seconds=VALIDATOR['validate_interval'])
    scheduler.add_job(anonymity_validator.run, 'interval', seconds=ANONYMITY_VALIDATOR['interval'])
    scheduler.start()
    app.run(host=WEB_SERVER['host'], port=WEB_SERVER['port'])
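One side effect of this wiring: APScheduler interval jobs first fire only after a full interval has elapsed, so the pool stays empty for crawl_interval seconds after startup. If an immediate first crawl is wanted, APScheduler 3.x accepts a next_run_time argument (a sketch, not part of this commit):

from datetime import datetime

scheduler.add_job(crawl, 'interval', seconds=SPIDER['crawl_interval'],
                  next_run_time=datetime.now())  # run once at startup, then on the interval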
5 changes: 2 additions & 3 deletions src/spider/abs_spider.py
@@ -10,13 +10,12 @@ def __init__(self, name='unknown') -> None:
         self._name = name

     async def crawl(self):
-        res = []
         logger.info(f'{self._name} starting crawl...')
         try:
-            res.extend(await self.do_crawl())
+            return await self.do_crawl()
         except Exception as e:
             logger.exception(f'{self._name} crawl failed: e:{e}')
-        return res
+            return []

     async def do_crawl(self) -> List[ProxyEntity]:
         raise NotImplementedError
69 changes: 69 additions & 0 deletions src/validator/anonymity_validator.py
@@ -0,0 +1,69 @@
import asyncio
import json

import aiohttp

from setting import ANONYMITY_VALIDATOR, HEADERS
from src.database.sqlite_opt import sqlite_opt
from src.enum.common import ProxyCoverEnum, ProxyTypeEnum
from src.log.logger import logger


class AnonymityValidator(object):

    # which echo endpoint to use for each proxy type
    urls = {
        ProxyTypeEnum.UNKNOWN.value: ANONYMITY_VALIDATOR['http_test_url'],
        ProxyTypeEnum.HTTP.value: ANONYMITY_VALIDATOR['http_test_url'],
        ProxyTypeEnum.HTTPS.value: ANONYMITY_VALIDATOR['https_test_url'],
        ProxyTypeEnum.HTTP_AND_HTTPS.value: ANONYMITY_VALIDATOR['https_test_url'],
    }

    def run(self):
        # fetch proxies whose anonymity is still unknown
        proxy_list = sqlite_opt.get_unknown_anonymity_proxies()
        if len(proxy_list) > 0:
            tasks = [self.valid_proxy(proxy.url, proxy.proxy_type) for proxy in proxy_list]
            asyncio.run(asyncio.wait(tasks))

    async def valid_proxy(self, proxy_url, proxy_type):
        async with aiohttp.ClientSession() as session:
            try:
                async with session.get(self.urls[proxy_type],
                                       proxy=proxy_url,
                                       headers=HEADERS,
                                       timeout=ANONYMITY_VALIDATOR['request_timeout']) as resp:
                    if resp.status == 200:
                        # judge anonymity from the echoed request
                        r_dict = json.loads(await resp.text())
                        headers = r_dict.get('headers', {})  # dict default; a '' default would break headers.get()
                        ip = r_dict.get('origin', '')  # httpbin's field is 'origin' ('origen' in the original is a bug)
                        proxy_connection = headers.get('Proxy-Connection', None)
                        flag = True
                        if ',' in ip:
                            ips = ip.split(',')
                            first = ips[0]
                            for p in ips:
                                if first != p.lstrip():
                                    # a second, different IP means the client address leaked
                                    proxy_cover = ProxyCoverEnum.TRANSPARENT.value  # transparent
                                    flag = False
                                    break
                        if flag:
                            if proxy_connection:
                                proxy_cover = ProxyCoverEnum.NORMAL_COVER.value  # ordinary anonymous
                            else:
                                proxy_cover = ProxyCoverEnum.HIGH_COVER.value  # elite (high anonymity)
                        # persist the anonymity level
                        sqlite_opt.update_anonymity(proxy_url, proxy_cover)
                        logger.info(f'anonymity check ok: url:{proxy_url}, coverValue:{proxy_cover}')
                    else:
                        logger.warning(f'anonymity check failed, proxy_url:{proxy_url}, status:{resp.status}')
            except asyncio.TimeoutError:
                logger.warning(f'anonymity check timed out, proxy_url:{proxy_url}')
            except ConnectionRefusedError:
                logger.warning(f'anonymity check refused, proxy_url:{proxy_url}')
            except Exception as e:
                logger.warning(f'anonymity check failed, proxy_url:{proxy_url}, e:{e}')


anonymity_validator = AnonymityValidator()
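The decision rule above distills to three cases: a second, different IP in origin means transparent; otherwise a Proxy-Connection header means ordinary anonymous; a clean echo means elite. A self-contained sketch of just that rule (classify and Cover are hypothetical stand-ins for the code above):

from enum import Enum

class Cover(Enum):  # stand-in for ProxyCoverEnum
    TRANSPARENT = 1
    NORMAL_COVER = 2
    HIGH_COVER = 3

def classify(origin: str, headers: dict) -> Cover:
    ips = [p.strip() for p in origin.split(',')]
    if any(p != ips[0] for p in ips):      # client IP echoed alongside the proxy's
        return Cover.TRANSPARENT
    if headers.get('Proxy-Connection'):    # proxy identified itself via headers
        return Cover.NORMAL_COVER
    return Cover.HIGH_COVER                # no leak, no proxy fingerprint

assert classify('1.2.3.4, 5.6.7.8', {}) is Cover.TRANSPARENT
assert classify('5.6.7.8', {}) is Cover.HIGH_COVER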
18 changes: 18 additions & 0 deletions test/validator/test_anonymity_validator.py
@@ -0,0 +1,18 @@
import unittest

from src.database.sqlite_opt import sqlite_opt
from src.validator.anonymity_validator import anonymity_validator


class TestAnonymityValidator(unittest.TestCase):

    def setUp(self) -> None:
        self._opt = sqlite_opt
        self._validator = anonymity_validator
        # self._opt.clean()

    def test_valid_proxy(self):
        self._validator.run()
