-
Notifications
You must be signed in to change notification settings - Fork 67
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
9 changed files
with
185 additions
and
50 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,44 +1,4 @@ | ||
import asyncio | ||
import typing | ||
|
||
from apscheduler.schedulers.background import BackgroundScheduler | ||
|
||
from src.database.sqlite_opt import sqlite_opt | ||
from src.entity.proxy_entity import ProxyEntity | ||
from src.log.logger import logger | ||
from src.spider.spiders import spider_collection | ||
from setting import WEB_SERVER, VALIDATOR, SPIDER | ||
from src.validator.validator import validator | ||
from src.web.web_flask import app | ||
|
||
|
||
def crawl():
    """Run every configured spider concurrently and persist what they find.

    One awaitable is created per spider name in ``SPIDER['list']``; all of
    them are driven to completion on a fresh event loop, their result lists
    are flattened into a single list, and that list is handed to ``save``.

    Any exception raised by a spider propagates to the caller, but the event
    loop is always closed first.
    """
    tasks = [spider_collection[spider_name].crawl() for spider_name in SPIDER['list']]

    # A dedicated loop is created for each call and installed as the current
    # loop of the calling thread before the spiders run.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        results = loop.run_until_complete(asyncio.gather(*tasks))
    finally:
        # Close the loop even when a spider fails, so repeated scheduled runs
        # do not leak one event loop per failure.
        loop.close()

    proxies = [proxy for proxies_list in results for proxy in proxies_list]
    # Persist the collected proxies.
    save(proxies)
|
||
|
||
def save(proxies: typing.List[ProxyEntity]):
    """Store each proxy in ``proxies`` through ``sqlite_opt.add_proxy``.

    Proxies are added one at a time, in the order given.
    """
    for entity in proxies:
        sqlite_opt.add_proxy(entity)
|
||
from src.runner import run | ||
|
||
if __name__ == '__main__':
    # Delegate the whole start-up sequence to run() (imported from
    # src.runner) instead of repeating it inline: doing both would start the
    # scheduler and the web server here and then start them again in run().
    run()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
import asyncio | ||
import typing | ||
|
||
from apscheduler.schedulers.background import BackgroundScheduler | ||
|
||
from src.database.sqlite_opt import sqlite_opt | ||
from src.entity.proxy_entity import ProxyEntity | ||
from src.log.logger import logger | ||
from src.spider.spiders import spider_collection | ||
from setting import WEB_SERVER, VALIDATOR, SPIDER, ANONYMITY_VALIDATOR | ||
from src.validator.validator import validator | ||
from src.validator.anonymity_validator import anonymity_validator | ||
from src.web.web_flask import app | ||
|
||
|
||
def crawl():
    """Run every configured spider concurrently and persist what they find.

    One awaitable is created per spider name in ``SPIDER['list']``; all of
    them are driven to completion on a fresh event loop, their result lists
    are flattened into a single list, and that list is handed to ``save``.

    Any exception raised by a spider propagates to the caller, but the event
    loop is always closed first.
    """
    tasks = [spider_collection[spider_name].crawl() for spider_name in SPIDER['list']]

    # A dedicated loop is created for each call and installed as the current
    # loop of the calling thread before the spiders run.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        results = loop.run_until_complete(asyncio.gather(*tasks))
    finally:
        # Close the loop even when a spider fails, so repeated scheduled runs
        # do not leak one event loop per failure.
        loop.close()

    proxies = [proxy for proxies_list in results for proxy in proxies_list]
    # Persist the collected proxies.
    save(proxies)
|
||
|
||
def save(proxies: typing.List[ProxyEntity]):
    """Store each proxy in ``proxies`` through ``sqlite_opt.add_proxy``.

    Proxies are added one at a time, in the order given.
    """
    for entity in proxies:
        sqlite_opt.add_proxy(entity)
|
||
|
||
def run():
    """Initialise the database, schedule the periodic jobs and start the web app.

    Three interval jobs are registered on a ``BackgroundScheduler``: the
    spider crawl, the proxy validator and the anonymity validator, each with
    the interval (in seconds) read from its settings dict. The scheduler is
    started before ``app.run`` is called with the configured host and port.
    """
    logger.info('初始化sqlite数据库...')
    sqlite_opt.init_db()

    background = BackgroundScheduler()
    # (callable, interval in seconds) for every recurring job.
    periodic_jobs = (
        (crawl, SPIDER['crawl_interval']),
        (validator.run, VALIDATOR['validate_interval']),
        (anonymity_validator.run, ANONYMITY_VALIDATOR['interval']),
    )
    for job, every_seconds in periodic_jobs:
        background.add_job(job, 'interval', seconds=every_seconds)
    background.start()

    app.run(host=WEB_SERVER['host'], port=WEB_SERVER['port'])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
import asyncio | ||
import json | ||
|
||
import aiohttp | ||
|
||
from setting import ANONYMITY_VALIDATOR, HEADERS | ||
from src.database.sqlite_opt import sqlite_opt | ||
from src.enum.common import ProxyCoverEnum, ProxyTypeEnum | ||
from src.log.logger import logger | ||
|
||
|
||
class AnonymityValidator(object):
    """Classify stored proxies by how well they hide the client.

    Each proxy whose anonymity is still unknown is used to fetch a test URL;
    the JSON body of the response is inspected and the resulting
    ``ProxyCoverEnum`` value is written back through ``sqlite_opt``.
    """

    # Test URL per proxy-type value. Unknown proxies are probed over HTTP,
    # proxies that support both schemes are probed over HTTPS.
    urls = {
        ProxyTypeEnum.UNKNOWN.value: ANONYMITY_VALIDATOR['http_test_url'],
        ProxyTypeEnum.HTTP.value: ANONYMITY_VALIDATOR['http_test_url'],
        ProxyTypeEnum.HTTPS.value: ANONYMITY_VALIDATOR['https_test_url'],
        ProxyTypeEnum.HTTP_AND_HTTPS.value: ANONYMITY_VALIDATOR['https_test_url'],
    }

    def run(self):
        """Validate every proxy whose anonymity is unknown; no-op if there are none."""
        # Fetch the proxies that still need an anonymity verdict.
        proxy_list = sqlite_opt.get_unknown_anonymity_proxies()
        if not proxy_list:
            return
        asyncio.run(self._valid_all(proxy_list))

    async def _valid_all(self, proxy_list):
        """Check all proxies in ``proxy_list`` concurrently and wait for all of them."""
        # gather() wraps the coroutines in tasks itself; asyncio.wait() no
        # longer accepts bare coroutines on current Python versions.
        # return_exceptions=True keeps the "wait for everything" behaviour
        # even if one check fails unexpectedly.
        await asyncio.gather(
            *(self.valid_proxy(proxy.url, proxy.proxy_type) for proxy in proxy_list),
            return_exceptions=True,
        )

    @staticmethod
    def _classify(origin, proxy_connection):
        """Map the reported origin address(es) and header to a cover value.

        :param origin: comma-separated address string reported by the test URL.
        :param proxy_connection: value of the ``Proxy-Connection`` header
            reported by the test URL, or ``None`` when absent.
        :return: a ``ProxyCoverEnum`` value.
        """
        addresses = {part.strip() for part in origin.split(',')}
        if len(addresses) > 1:
            # More than one distinct address was reported: transparent.
            return ProxyCoverEnum.TRANSPARENT.value
        if proxy_connection:
            # The proxy header is visible to the target: normal anonymity.
            return ProxyCoverEnum.NORMAL_COVER.value
        # Single address and no proxy header: high anonymity.
        return ProxyCoverEnum.HIGH_COVER.value

    async def valid_proxy(self, proxy_url, proxy_type):
        """Probe one proxy and store its anonymity level.

        :param proxy_url: proxy address passed to aiohttp as ``proxy=``.
        :param proxy_type: key into ``urls`` selecting the test URL.

        Failures (timeout, refused connection, non-200 status, malformed
        response) are logged and swallowed; nothing is stored in that case.
        """
        async with aiohttp.ClientSession() as session:
            try:
                async with session.get(self.urls[proxy_type],
                                       proxy=proxy_url,
                                       headers=HEADERS,
                                       timeout=ANONYMITY_VALIDATOR['request_timeout']) as resp:
                    if resp.status != 200:
                        logger.warn(f'验证匿名性失败, proxy_url:{proxy_url}, 返回码:{resp.status}')
                        return
                    r_dict = json.loads(await resp.text())

                # NOTE(review): assumes the test URL echoes the request as
                # JSON with `origin` and `headers` keys (httpbin-style) —
                # confirm against the ANONYMITY_VALIDATOR settings.
                origin = r_dict.get('origin')
                headers = r_dict.get('headers') or {}
                if not origin:
                    # Without the origin field no verdict can be reached.
                    logger.warn(f'验证匿名性失败, proxy_url:{proxy_url}, e:missing origin in response')
                    return

                proxy_cover = self._classify(origin, headers.get('Proxy-Connection'))
                # Store the verdict.
                sqlite_opt.update_anonymity(proxy_url, proxy_cover)
                logger.info(f'验证匿名性成功: url:{proxy_url}, coverValue:{proxy_cover}')
            except asyncio.TimeoutError:
                logger.warn(f'验证匿名性请求超时, proxy_url:{proxy_url}')
            except ConnectionRefusedError:
                logger.warn(f'验证匿名性请求被拒绝, proxy_url:{proxy_url}')
            except Exception as e:
                # Best effort: one bad proxy must not abort the whole batch.
                logger.warn(f'验证匿名性失败, proxy_url:{proxy_url}, e:{e}')


# Shared module-level instance.
anonymity_validator = AnonymityValidator()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
import unittest | ||
|
||
from src.database.sqlite_opt import sqlite_opt | ||
from src.validator.anonymity_validator import anonymity_validator | ||
|
||
|
||
class TestAnonymityValidator(unittest.TestCase):
    """Smoke test for the anonymity validator; it makes no assertions."""

    def setUp(self) -> None:
        """Bind the shared database helper and validator to the test case."""
        self._opt = sqlite_opt
        self._validator = anonymity_validator
        # The database cleanup call (self._opt.clean()) is currently disabled.

    def test_valid_proxy(self):
        """Run one validation pass; the test passes if nothing raises."""
        self._validator.run()
|