Merge pull request #121 from FutaGuard/new-ndr-list

 Add integration of multiple NDR source lists
踢低吸 2024-04-14 00:05:10 +08:00 committed by GitHub
commit 104b6beb60
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 158 additions and 42 deletions

View File

@@ -1,26 +1,27 @@
-import signal
-from typing import Dict
-import httpx
-import arrow
-from base64 import b64encode
-import pathlib
-import logging
 import asyncio
+import csv
+import gzip
+import logging
+import os
+import pathlib
+from base64 import b64encode
+from io import BytesIO, StringIO
+from typing import Dict, List
 from zipfile import ZipFile, BadZipfile
-from io import BytesIO
+
+import arrow
+import httpx
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 
-class Downloader:
+class Phase1:
     def __init__(self):
-        self.base_url = "https://www.whoisds.com//whois-database/newly-registered-domains/{args}/nrd"
-        self.base_path = pathlib.Path("nrd")
-        self.data: Dict[str, BytesIO] = {}
-
-        if not self.base_path.exists():
-            self.base_path.mkdir()
+        self.base_url = os.getenv("PHASE1_URL")
+        if not self.base_url:
+            raise ValueError("PHASE1_URL not set")
+        self.data: Dict[str, List[str]] = {}
 
     async def fetch(self, date: arrow.Arrow) -> bool:
         logger.info("Downloading: %s", date.format("YYYY-MM-DD"))
@@ -36,37 +37,146 @@ class Downloader:
         try:
             with ZipFile(zip_file, "r") as zip_obj:
                 # print(zip_obj.read('domain-names.txt'))
-                self.data[date.format("YYYY-MM-DD")] = zip_obj.read(
-                    "domain-names.txt"
+                self.data[date.format("YYYY-MM-DD")] = (
+                    zip_obj.read("domain-names.txt").decode().splitlines()
                 )
         except BadZipfile:
             logger.error("Bad Zipfile: %s", url)
             return False
         return True
 
-    async def write(self):
-        sort_date = sorted(self.data.keys(), reverse=True)
-        accumulate = ""
-        for date in range(len(sort_date)):
-            accumulate += self.data[sort_date[date]].decode()
-            accumulate = "\n".join(sorted(set(accumulate.split("\n"))))
-            self.base_path.joinpath(f"past-{date+1}day.txt").write_bytes(
-                accumulate.encode()
-            )
-
-    def run(self):
-        loop = asyncio.get_event_loop()
-        loop.add_signal_handler(signal.SIGILL, loop.stop)
-        loop.add_signal_handler(signal.SIGINT, loop.stop)
+    async def run(self, loop: asyncio.AbstractEventLoop):
         today = arrow.utcnow()
-        task = []
-        for i in range(1, 30, 5):
+        for i in range(1, 31, 5):
+            task = []
             for j in range(i, i + 5):
                 date = today.shift(days=-j)
                 task.append(loop.create_task(self.fetch(date)))
-            loop.run_until_complete(asyncio.gather(*task))
-        asyncio.run(self.write())
+            await asyncio.gather(*task)
+
+
+class Phase2:
+    def __init__(self):
+        self.base_url = os.getenv("PHASE2_URL")
+        if not self.base_url:
+            raise ValueError("PHASE2_URL not set")
+        self.data: Dict[str, List[str]] = {}
+
+    async def fetch(self):
+        now = arrow.utcnow()
+        async with httpx.AsyncClient() as client:
+            for files in ["nrd-1m.csv", "nrd-1w.csv"]:
+                url = self.base_url + files
+                logger.info("Downloading: %s", files)
+                r = await client.get(url)
+                if r.status_code != 200:
+                    logger.error("Download failed: %s", files)
+                    return False
+                if files == "nrd-1m.csv":
+                    self.data[now.shift(months=-1).date().strftime("%Y-%m-%d")] = (
+                        BytesIO(r.content).getvalue().decode().splitlines()
+                    )
+                else:
+                    self.data[now.shift(weeks=-1).date().strftime("%Y-%m-%d")] = (
+                        BytesIO(r.content).getvalue().decode().splitlines()
+                    )
+
+    async def run(self):
+        await self.fetch()
+
+
+class Phase3:
+    def __init__(self):
+        self.base_url = os.getenv("PHASE3_URL")
+        if not self.base_url:
+            raise ValueError("PHASE3_URL not set")
+        self.data: Dict[str, List[str]] = {}
+
+    async def fetch(self):
+        async with httpx.AsyncClient() as client:
+            logger.info("Downloading: %s", self.base_url)
+            r = await client.get(self.base_url)
+            if r.status_code != 200:
+                logger.error("Download failed: %s", self.base_url)
+                return False
+            with gzip.GzipFile(fileobj=BytesIO(r.content), mode="rb") as f:
+                raw_data = BytesIO(f.read()).getvalue().decode()
+                data_file = StringIO(raw_data)
+                reader = csv.DictReader(data_file)
+                for row in reader:
+                    if row["create_date"]:
+                        self.data.setdefault(row["create_date"], []).append(row["domain_name"])
+
+    async def run(self):
+        await self.fetch()
+
+
+class Phase4:
+    def __init__(self):
+        self.base_url = os.getenv("PHASE4_URL")
+        if not self.base_url:
+            raise ValueError("PHASE4_URL not set")
+        self.data: Dict[str, List[str]] = {}
+
+    async def fetch(self):
+        now = arrow.utcnow()
+        async with httpx.AsyncClient() as client:
+            logger.info("Downloading: %s", self.base_url)
+            r = await client.get(self.base_url)
+            if r.status_code != 200:
+                logger.error("Download failed: %s", self.base_url)
+                return False
+            date = now.shift(days=-7).date().strftime("%Y-%m-%d")
+            self.data[date] = r.text.splitlines()[2:-2]
+
+    async def run(self):
+        for i in range(5):
+            try:
+                await self.fetch()
+            except httpx.ReadTimeout:
+                logger.error("Phase4: Timeout, retrying")
+                continue
+            break  # stop after a successful fetch
+
+
+async def write_files(datalist: List[Dict[str, List[str]]]):
+    base_path = pathlib.Path("nrd")
+    if not base_path.exists():
+        base_path.mkdir()
+
+    combined_data: Dict[str, set] = {}
+    for data in datalist:
+        for key, value in data.items():
+            if key not in combined_data:
+                combined_data[key] = set(value)
+            else:
+                combined_data[key].update(value)
+
+    sort_date = sorted(combined_data.keys(), reverse=True)[:30]
+    accumulate = ""
+    for date in range(len(sort_date)):
+        accumulate += "\n".join(combined_data[sort_date[date]])
+        # accumulate = "\n".join(sorted(set(accumulate.split("\n"))))
+        base_path.joinpath(f"past-{(date + 1):02d}day.txt").write_bytes(
+            accumulate.encode()
+        )


 if __name__ == "__main__":
-    Downloader().run()
+    import time
+
+    start = time.time()
+    loop = asyncio.get_event_loop()
+    ph1 = Phase1()
+    ph2 = Phase2()
+    ph3 = Phase3()
+    ph4 = Phase4()
+    task = [ph1.run(loop), ph2.run(), ph3.run(), ph4.run()]
+    loop.run_until_complete(asyncio.gather(*task))
+    logger.info("Download Complete, Now writing")
+    loop.run_until_complete(write_files([ph1.data, ph2.data, ph3.data, ph4.data]))
+    end = time.time() - start
+    logger.info(f"Time taken: {end:.2f} seconds")

View File

@@ -52,11 +52,17 @@ https://t.me/adblock_tw
- <https://filter.futa.gg/TW165_domains.txt>
-| hosts list overview | LowTechHost | TW165 Taiwan Anti-Scam | TW RPZ Resolution Blocklist | NoFarm Content-Farm List |
-| --- | --- | --- | --- | --- |
-| Adblock syntax | [Subscribe](https://filter.futa.gg/hosts_abp.txt) | [Subscribe](https://filter.futa.gg/TW165_abp.txt) | [Subscribe](https://filter.futa.gg/TWNIC-RPZ_abp.txt) | [Subscribe](https://filter.futa.gg/nofarm_abp.txt) |
-| hosts | [Subscribe](https://filter.futa.gg/hosts.txt) | [Subscribe](https://filter.futa.gg/TW165_hosts.txt) | [Subscribe](https://filter.futa.gg/TWNIC-RPZ_hosts.txt) | [Subscribe](https://filter.futa.gg/nofarm_hosts.txt) |
-| Domains only | [Subscribe](https://filter.futa.gg/hosts_domains.txt) | [Subscribe](https://filter.futa.gg/TW165_domains.txt) | [Subscribe](https://filter.futa.gg/TWNIC-RPZ_domains.txt) | [Subscribe](https://filter.futa.gg/nofarm_domains.txt) |
+> A curated compilation of NRD lists gathered from around the web. NRD (Newly Registered Domain) refers to recently registered domains; newly registered domains tend to carry higher risk and are frequently used by scam groups. Lists are provided covering the past 1 day up to the past 30 days.
+
+- <https://filter.futa.gg/ndr/past-01day_hosts.txt> (past 1 day, hosts format)
+- <https://filter.futa.gg/ndr/past-07day_abp.txt> (past 7 days, adblock format)
+
+| hosts list overview | LowTechHost | TW165 Taiwan Anti-Scam | TW RPZ Resolution Blocklist | NoFarm Content-Farm List | NRD List (past 1 day) |
+| --- | --- | --- | --- | --- | --- |
+| Adblock syntax | [Subscribe](https://filter.futa.gg/hosts_abp.txt) | [Subscribe](https://filter.futa.gg/TW165_abp.txt) | [Subscribe](https://filter.futa.gg/TWNIC-RPZ_abp.txt) | [Subscribe](https://filter.futa.gg/nofarm_abp.txt) | [Subscribe](https://filter.futa.gg/nrd/past-01day_abp.txt) |
+| hosts | [Subscribe](https://filter.futa.gg/hosts.txt) | [Subscribe](https://filter.futa.gg/TW165_hosts.txt) | [Subscribe](https://filter.futa.gg/TWNIC-RPZ_hosts.txt) | [Subscribe](https://filter.futa.gg/nofarm_hosts.txt) | [Subscribe](https://filter.futa.gg/nrd/past-01day_hosts.txt) |
+| Domains only | [Subscribe](https://filter.futa.gg/hosts_domains.txt) | [Subscribe](https://filter.futa.gg/TW165_domains.txt) | [Subscribe](https://filter.futa.gg/TWNIC-RPZ_domains.txt) | [Subscribe](https://filter.futa.gg/nofarm_domains.txt) | [Subscribe](https://filter.futa.gg/nrd/past-01day_domains.txt) |
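
For subscribers who want to consume the plain-domain NRD list programmatically rather than through a blocker, a minimal sketch using `httpx` is shown below. The URL is the past-1-day domains list from the table above; the function name `fetch_nrd_domains` is illustrative and not part of this repository.

```python
from typing import List

import httpx

NRD_URL = "https://filter.futa.gg/nrd/past-01day_domains.txt"


def fetch_nrd_domains(url: str = NRD_URL) -> List[str]:
    """Download the past-1-day NRD list and return one domain per entry."""
    r = httpx.get(url, timeout=30.0)
    r.raise_for_status()
    return [line.strip() for line in r.text.splitlines() if line.strip()]


if __name__ == "__main__":
    domains = fetch_nrd_domains()
    print(f"{len(domains)} newly registered domains seen in the past day")
```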