Compare commits

..

No commits in common. "104b6beb60ada07be92ee9c1db1c2ae824e200d2" and "e2324cd0960809cdfde827f475e294a073b67fa9" have entirely different histories.

3 changed files with 42 additions and 160 deletions

View File

@ -1,27 +1,26 @@
import asyncio
import csv
import gzip
import logging
import os
import pathlib
from base64 import b64encode
from io import BytesIO, StringIO
from typing import Dict, List
from zipfile import ZipFile, BadZipfile
import signal
from typing import Dict
import arrow
import httpx
import arrow
from base64 import b64encode
import pathlib
import logging
import asyncio
from zipfile import ZipFile, BadZipfile
from io import BytesIO
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class Phase1:
class Downloader:
def __init__(self):
self.base_url = os.getenv("PHASE1_URL")
if not self.base_url:
raise ValueError("PHASE1_URL not set")
self.data: Dict[str, List[str]] = {}
self.base_url = "https://www.whoisds.com//whois-database/newly-registered-domains/{args}/nrd"
self.base_path = pathlib.Path("nrd")
self.data: Dict[str, BytesIO] = {}
if not self.base_path.exists():
self.base_path.mkdir()
async def fetch(self, date: arrow.Arrow) -> bool:
logger.info("Downloading: %s", date.format("YYYY-MM-DD"))
@ -37,146 +36,37 @@ class Phase1:
try:
with ZipFile(zip_file, "r") as zip_obj:
# print(zip_obj.read('domain-names.txt'))
self.data[date.format("YYYY-MM-DD")] = (
zip_obj.read("domain-names.txt").decode().splitlines()
self.data[date.format("YYYY-MM-DD")] = zip_obj.read(
"domain-names.txt"
)
except BadZipfile:
logger.error("Bad Zipfile: %s", url)
return False
return True
async def run(self, loop: asyncio.AbstractEventLoop):
async def write(self):
sort_date = sorted(self.data.keys(), reverse=True)
accumulate = ""
for date in range(len(sort_date)):
accumulate += self.data[sort_date[date]].decode()
accumulate = "\n".join(sorted(set(accumulate.split("\n"))))
self.base_path.joinpath(f"past-{date+1}day.txt").write_bytes(
accumulate.encode()
)
def run(self):
loop = asyncio.get_event_loop()
loop.add_signal_handler(signal.SIGILL, loop.stop)
loop.add_signal_handler(signal.SIGINT, loop.stop)
today = arrow.utcnow()
for i in range(1, 31, 5):
task = []
task = []
for i in range(1, 30, 5):
for j in range(i, i + 5):
date = today.shift(days=-j)
task.append(loop.create_task(self.fetch(date)))
await asyncio.gather(*task)
class Phase2:
    """Download the two CSV snapshots served under ``PHASE2_URL``.

    The base URL is read from the ``PHASE2_URL`` environment variable and the
    file names are appended to it verbatim, so it is expected to end with a
    path separator.  Every downloaded file is split into lines and stored in
    ``self.data`` under a ``YYYY-MM-DD`` key.
    """

    # (file name, keyword arguments for ``arrow.Arrow.shift``) pairs.  The
    # shifted UTC date is the key the file's lines are stored under.
    _FILES = (
        ("nrd-1m.csv", {"months": -1}),
        ("nrd-1w.csv", {"weeks": -1}),
    )

    def __init__(self):
        """Read the base URL from the environment.

        Raises:
            ValueError: if ``PHASE2_URL`` is unset or empty.
        """
        self.base_url = os.getenv("PHASE2_URL")
        if not self.base_url:
            raise ValueError("PHASE2_URL not set")
        # Date string (YYYY-MM-DD) -> lines of the file filed under that date.
        self.data: Dict[str, List[str]] = {}

    async def fetch(self) -> bool:
        """Download both files and store their lines in ``self.data``.

        Returns:
            ``True`` when every file was downloaded, ``False`` as soon as one
            request answers with a status other than 200 (the remaining files
            are then skipped; anything stored before the failure is kept).
        """
        now = arrow.utcnow()
        async with httpx.AsyncClient() as client:
            for filename, offset in self._FILES:
                url = self.base_url + filename
                logger.info("Downloading: %s", filename)
                r = await client.get(url)
                if r.status_code != 200:
                    logger.error("Download failed: %s", filename)
                    return False
                key = now.shift(**offset).date().strftime("%Y-%m-%d")
                # Decode the raw body as UTF-8, independent of any charset
                # announced by the server.
                self.data[key] = r.content.decode().splitlines()
        return True

    async def run(self):
        """Entry point used by the driver; performs a single fetch."""
        await self.fetch()
class Phase3:
    """Download one gzip-compressed CSV from ``PHASE3_URL`` and group it by date.

    The CSV is read with ``csv.DictReader`` and must provide the columns
    ``create_date`` and ``domain_name``.
    """

    def __init__(self):
        """Read the download URL from the environment.

        Raises:
            ValueError: if ``PHASE3_URL`` is unset or empty.
        """
        self.base_url = os.getenv("PHASE3_URL")
        if not self.base_url:
            raise ValueError("PHASE3_URL not set")
        # ``create_date`` value -> domain names listed with that value.
        self.data: Dict[str, List[str]] = {}

    async def fetch(self) -> bool:
        """Download, decompress and parse the CSV into ``self.data``.

        Returns:
            ``True`` when the file was downloaded and parsed, ``False`` when
            the server answered with a status other than 200.

        Raises:
            KeyError: if a row lacks the ``create_date`` or ``domain_name``
                column.
        """
        async with httpx.AsyncClient() as client:
            logger.info("Downloading: %s", self.base_url)
            r = await client.get(self.base_url)
        if r.status_code != 200:
            logger.error("Download failed: %s", self.base_url)
            return False

        # The whole body is already in memory, so decompress it in one step.
        text = gzip.decompress(r.content).decode()
        for row in csv.DictReader(StringIO(text)):
            # Rows with an empty creation date cannot be filed under a day.
            if row["create_date"]:
                self.data.setdefault(row["create_date"], []).append(
                    row["domain_name"]
                )
        return True

    async def run(self):
        """Entry point used by the driver; performs a single fetch."""
        await self.fetch()
class Phase4:
    """Download a plain-text list from ``PHASE4_URL``, retrying on read timeouts."""

    # Upper bound on download attempts made by ``run``.
    _MAX_ATTEMPTS = 5

    def __init__(self):
        """Read the download URL from the environment.

        Raises:
            ValueError: if ``PHASE4_URL`` is unset or empty.
        """
        self.base_url = os.getenv("PHASE4_URL")
        if not self.base_url:
            raise ValueError("PHASE4_URL not set")
        # Date string (YYYY-MM-DD) -> lines filed under that date.
        self.data: Dict[str, List[str]] = {}

    async def fetch(self) -> bool:
        """Download the list and store it under the date seven days ago (UTC).

        Returns:
            ``True`` when the list was stored, ``False`` when the server
            answered with a status other than 200.
        """
        now = arrow.utcnow()
        async with httpx.AsyncClient() as client:
            logger.info("Downloading: %s", self.base_url)
            r = await client.get(self.base_url)
        if r.status_code != 200:
            logger.error("Download failed: %s", self.base_url)
            return False
        date = now.shift(days=-7).date().strftime("%Y-%m-%d")
        # The first two and last two lines are dropped -- presumably a
        # header/footer around the domain list; TODO confirm against the feed.
        self.data[date] = r.text.splitlines()[2:-2]
        return True

    async def run(self):
        """Call ``fetch`` until it completes without a read timeout.

        At most ``_MAX_ATTEMPTS`` attempts are made.  Only ``httpx.ReadTimeout``
        triggers a retry; any other exception propagates to the caller.  (The
        previous ``finally: break`` left the loop after the first attempt in
        every case, so timeouts were never retried and other exceptions were
        silently discarded.)
        """
        for _ in range(self._MAX_ATTEMPTS):
            try:
                await self.fetch()
            except httpx.ReadTimeout:
                logger.error("Phase4: Timeout, retrying")
                continue
            return
        logger.error("Phase4: giving up after %d timeouts", self._MAX_ATTEMPTS)
async def write_files(datalist: List[Dict[str, List[str]]]):
    """Merge per-source results and write cumulative per-day files into ``nrd/``.

    Args:
        datalist: one mapping per source, each from a date string to the
            domains collected for that date.  Entries for the same date are
            merged and de-duplicated across sources.

    The 30 greatest date keys are used, newest first.  File
    ``past-NNday.txt`` contains the domains of the ``NN`` newest dates, one
    domain per line; domains are sorted within each date so that repeated runs
    over the same input produce identical files.  Domains are not
    de-duplicated across different dates.
    """
    base_path = pathlib.Path("nrd")
    base_path.mkdir(exist_ok=True)

    combined_data: Dict[str, set] = {}
    for data in datalist:
        for day, domains in data.items():
            combined_data.setdefault(day, set()).update(domains)

    newest_first = sorted(combined_data, reverse=True)[:30]
    chunks: List[str] = []
    for index, day in enumerate(newest_first, start=1):
        if combined_data[day]:
            chunks.append("\n".join(sorted(combined_data[day])))
        # Join the per-day chunks with a newline as well; without it the last
        # domain of one day runs into the first domain of the next.
        base_path.joinpath(f"past-{index:02d}day.txt").write_bytes(
            "\n".join(chunks).encode()
        )
loop.run_until_complete(asyncio.gather(*task))
asyncio.run(self.write())
if __name__ == "__main__":
import time
start = time.time()
loop = asyncio.get_event_loop()
ph1 = Phase1()
ph2 = Phase2()
ph3 = Phase3()
ph4 = Phase4()
task = [ph1.run(loop), ph2.run(), ph3.run(), ph4.run()]
loop.run_until_complete(asyncio.gather(*task))
logger.info("Download Complete, Now writing")
loop.run_until_complete(write_files([ph1.data, ph2.data, ph3.data, ph4.data]))
end = time.time() - start
logger.info(f"Time taken: {end:.2f} seconds")
Downloader().run()

View File

@ -52,17 +52,11 @@ https://t.me/adblock_tw
- <https://filter.futa.gg/TW165_domains.txt>
> 從網路上羅列的 NRD 清單經過整理的版本。NRD (Newly Registered Domain) 意思是近期註冊的網域,通常新註冊的網域有較高的風險,經常為詐騙集團所用,此清單提供過去 1 天至 30 天的清單
- <https://filter.futa.gg/nrd/past-01day_hosts.txt> (過去 1 天,hosts 格式)
- <https://filter.futa.gg/nrd/past-07day_abp.txt> (過去 7 天,adblock 格式)
| hosts 清單一覽 | LowTechHost | TW165 台灣反詐騙 | TW RPZ 阻止解析清單 | NoFarm 農場文清單 | NRD 清單(過去1天) |
| -------------- | ---------------------------------------------------- | ---------------------------------------------------- | -------------------------------------------------------- | ----------------------------------------------------- | ---------------------------------------------------- |
| Adblock 語法 | [訂閱連結](https://filter.futa.gg/hosts_abp.txt) | [訂閱連結](https://filter.futa.gg/TW165_abp.txt) | [訂閱連結](https://filter.futa.gg/TWNIC-RPZ_abp.txt) | [訂閱連結](https://filter.futa.gg/nofarm_abp.txt) | [訂閱連結](https://filter.futa.gg/nrd/past-01day_abp.txt) |
| hosts | [訂閱連結](https://filter.futa.gg/hosts.txt) | [訂閱連結](https://filter.futa.gg/TW165_hosts.txt) | [訂閱連結](https://filter.futa.gg/TWNIC-RPZ_hosts.txt) | [訂閱連結](https://filter.futa.gg/nofarm_hosts.txt) | [訂閱連結](https://filter.futa.gg/nrd/past-01day_hosts.txt) |
| 純網域 | [訂閱連結](https://filter.futa.gg/hosts_domains.txt) | [訂閱連結](https://filter.futa.gg/TW165_domains.txt) | [訂閱連結](https://filter.futa.gg/TWNIC-RPZ_domains.txt) | [訂閱連結](https://filter.futa.gg/nofarm_domains.txt) | [訂閱連結](https://filter.futa.gg/nrd/past-01day_domains.txt) |
| hosts 清單一覽 | LowTechHost | TW165 台灣反詐騙 | TW RPZ 阻止解析清單 | NoFarm 農場文清單 |
| -------------- | ---------------------------------------------------- | ---------------------------------------------------- | -------------------------------------------------------- | ----------------------------------------------------- |
| Adblock 語法 | [訂閱連結](https://filter.futa.gg/hosts_abp.txt) | [訂閱連結](https://filter.futa.gg/TW165_abp.txt) | [訂閱連結](https://filter.futa.gg/TWNIC-RPZ_abp.txt) | [訂閱連結](https://filter.futa.gg/nofarm_abp.txt) |
| hosts | [訂閱連結](https://filter.futa.gg/hosts.txt) | [訂閱連結](https://filter.futa.gg/TW165_hosts.txt) | [訂閱連結](https://filter.futa.gg/TWNIC-RPZ_hosts.txt) | [訂閱連結](https://filter.futa.gg/nofarm_hosts.txt) |
| 純網域 | [訂閱連結](https://filter.futa.gg/hosts_domains.txt) | [訂閱連結](https://filter.futa.gg/TW165_domains.txt) | [訂閱連結](https://filter.futa.gg/TWNIC-RPZ_domains.txt) | [訂閱連結](https://filter.futa.gg/nofarm_domains.txt) |

View File

@ -142,8 +142,6 @@
||godkendsuoabn.com^
||steamcommunllty.com^
||abloeic.online^
||mxwnaqo.cn^
||donggpt.cn^
! 加密貨幣釣魚
||app.exodus.com.alchemys.cl^