From 11ffc8dea97719ccb10798f4e78137f159f1c1ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=B8=A2=E4=BD=8E=E5=90=B8?= Date: Sat, 13 Apr 2024 21:52:16 +0800 Subject: [PATCH 1/5] =?UTF-8?q?=E2=9A=A1=20=E6=96=B0=E5=A2=9E=E8=A4=87?= =?UTF-8?q?=E6=95=B8=20NDR=20=E4=BE=86=E6=BA=90=E6=B8=85=E5=96=AE=E6=95=B4?= =?UTF-8?q?=E5=90=88?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- AutoBuild/nrdlist.py | 145 ++++++++++++++++++++++++++++++++----------- 1 file changed, 108 insertions(+), 37 deletions(-) diff --git a/AutoBuild/nrdlist.py b/AutoBuild/nrdlist.py index f2cfb4c..b25ada7 100644 --- a/AutoBuild/nrdlist.py +++ b/AutoBuild/nrdlist.py @@ -1,26 +1,27 @@ -import signal -from typing import Dict - -import httpx -import arrow -from base64 import b64encode -import pathlib -import logging import asyncio +import csv +import gzip +import logging +import os +import pathlib +from base64 import b64encode +from io import BytesIO, StringIO +from typing import Dict, List from zipfile import ZipFile, BadZipfile -from io import BytesIO + +import arrow +import httpx logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) -class Downloader: +class Phase1: def __init__(self): - self.base_url = "https://www.whoisds.com//whois-database/newly-registered-domains/{args}/nrd" - self.base_path = pathlib.Path("nrd") - self.data: Dict[str, BytesIO] = {} - if not self.base_path.exists(): - self.base_path.mkdir() + self.base_url = os.getenv("PHASE1_URL") + if not self.base_url: + raise ValueError("PHASE1_URL not set") + self.data: Dict[str, List[str]] = {} async def fetch(self, date: arrow.Arrow) -> bool: logger.info("Downloading: %s", date.format("YYYY-MM-DD")) @@ -36,37 +37,107 @@ class Downloader: try: with ZipFile(zip_file, "r") as zip_obj: # print(zip_obj.read('domain-names.txt')) - self.data[date.format("YYYY-MM-DD")] = zip_obj.read( - "domain-names.txt" + self.data[date.format("YYYY-MM-DD")] = ( + zip_obj.read("domain-names.txt").decode().splitlines() ) except BadZipfile: logger.error("Bad Zipfile: %s", url) return False return True - async def write(self): - sort_date = sorted(self.data.keys(), reverse=True) - accumulate = "" - for date in range(len(sort_date)): - accumulate += self.data[sort_date[date]].decode() - accumulate = "\n".join(sorted(set(accumulate.split("\n")))) - self.base_path.joinpath(f"past-{date+1}day.txt").write_bytes( - accumulate.encode() - ) - - def run(self): - loop = asyncio.get_event_loop() - loop.add_signal_handler(signal.SIGILL, loop.stop) - loop.add_signal_handler(signal.SIGINT, loop.stop) + async def run(self, loop: asyncio.AbstractEventLoop): today = arrow.utcnow() - task = [] - - for i in range(1, 30, 5): + for i in range(1, 31, 5): + task = [] for j in range(i, i + 5): date = today.shift(days=-j) task.append(loop.create_task(self.fetch(date))) - loop.run_until_complete(asyncio.gather(*task)) - asyncio.run(self.write()) + await asyncio.gather(*task) + + +class Phase2: + def __init__(self): + self.base_url = os.getenv("PHASE2_URL") + if not self.base_url: + raise ValueError("PHASE2_URL not set") + self.data: Dict[str, List[str]] = {} + + async def fetch(self): + now = arrow.utcnow() + async with httpx.AsyncClient() as client: + for files in ["nrd-1m.csv", "nrd-1w.csv"]: + url = self.base_url + files + logger.info("Downloading: %s", files) + r = await client.get(url) + if r.status_code != 200: + logger.error("Download failed: %s", files) + return False + if files == "nrd-1m.csv": + self.data[now.shift(months=-1).date().strftime("%Y-%m-%d")] = ( + BytesIO(r.content).getvalue().decode().splitlines() + ) + else: + self.data[now.shift(weeks=-1).date().strftime("%Y-%m-%d")] = ( + BytesIO(r.content).getvalue().decode().splitlines() + ) + + async def run(self): + await self.fetch() + + +class Phase3: + def __init__(self): + self.base_url = os.getenv("PHASE3_URL") + if not self.base_url: + raise ValueError("PHASE3_URL not set") + self.data: Dict[str, List[str]] = {} + + async def fetch(self): + async with httpx.AsyncClient() as client: + logger.info("Downloading: %s", self.base_url) + r = await client.get(self.base_url) + if r.status_code != 200: + logger.error("Download failed: %s", self.base_url) + return False + + with gzip.GzipFile(fileobj=BytesIO(r.content), mode="rb") as f: + raw_data = BytesIO(f.read()).getvalue().decode() + + data_file = StringIO(raw_data) + reader = csv.DictReader(data_file) + for row in reader: + if row["create_date"]: + self.data.setdefault(row["create_date"], []).append(row["domain_name"]) + + async def run(self): + await self.fetch() + if __name__ == "__main__": - Downloader().run() + loop = asyncio.get_event_loop() + ph1 = Phase1() + ph2 = Phase2() + ph3 = Phase3() + + task = [ph1.run(loop), ph2.run(), ph3.run()] + + loop.run_until_complete(asyncio.gather(*task)) + logger.info("Download Complete, Now writing") + base_path = pathlib.Path("nrd") + if not base_path.exists(): + base_path.mkdir() + + combined_data: Dict[str, set] = {} + for data in [ph1.data, ph2.data, ph3.data]: + for key, value in data.items(): + if key not in combined_data: + combined_data[key] = set(value) + else: + combined_data[key].update(value) + + sort_date = sorted(combined_data.keys(), reverse=True) + accumulate = "" + for date in range(len(sort_date)): + accumulate += "\n".join(combined_data[sort_date[date]]) + accumulate = "\n".join(sorted(set(accumulate.split("\n")))) + base_path.joinpath(f"past-{date + 1}day.txt").write_bytes(accumulate.encode()) From 9edeaec9cc55a29af495962b5eaa76aa04859d38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=B8=A2=E4=BD=8E=E5=90=B8?= Date: Sat, 13 Apr 2024 22:19:46 +0800 Subject: [PATCH 2/5] =?UTF-8?q?=E2=9A=A1=20=E6=9B=B4=E6=96=B0=20NRD=20?= =?UTF-8?q?=E6=B8=85=E5=96=AE=20leading=20zero?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- AutoBuild/nrdlist.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/AutoBuild/nrdlist.py b/AutoBuild/nrdlist.py index b25ada7..ee271f4 100644 --- a/AutoBuild/nrdlist.py +++ b/AutoBuild/nrdlist.py @@ -140,4 +140,4 @@ if __name__ == "__main__": for date in range(len(sort_date)): accumulate += "\n".join(combined_data[sort_date[date]]) accumulate = "\n".join(sorted(set(accumulate.split("\n")))) - base_path.joinpath(f"past-{date + 1}day.txt").write_bytes(accumulate.encode()) + base_path.joinpath(f"past-{(date + 1):02d}day.txt").write_bytes(accumulate.encode()) From e105961304c27f43167f3402cdb2b6a1f01a4772 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=B8=A2=E4=BD=8E=E5=90=B8?= Date: Sat, 13 Apr 2024 23:26:26 +0800 Subject: [PATCH 3/5] =?UTF-8?q?=E2=9A=A1=20dict=20=E4=B8=AD=E6=93=8D?= =?UTF-8?q?=E4=BD=9C=20sort=20=E5=8A=A0=E9=80=9F=E6=95=B4=E5=80=8B?= =?UTF-8?q?=E9=81=8E=E7=A8=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- AutoBuild/nrdlist.py | 57 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 45 insertions(+), 12 deletions(-) diff --git a/AutoBuild/nrdlist.py b/AutoBuild/nrdlist.py index ee271f4..995e39a 100644 --- a/AutoBuild/nrdlist.py +++ b/AutoBuild/nrdlist.py @@ -113,31 +113,64 @@ class Phase3: await self.fetch() -if __name__ == "__main__": - loop = asyncio.get_event_loop() - ph1 = Phase1() - ph2 = Phase2() - ph3 = Phase3() +class Phase4: + def __init__(self): + self.base_url = os.getenv("PHASE4_URL") + if not self.base_url: + raise ValueError("PHASE4_URL not set") + self.data: Dict[str, List[str]] = {} - task = [ph1.run(loop), ph2.run(), ph3.run()] + async def fetch(self): + now = arrow.utcnow() + async with httpx.AsyncClient() as client: + logger.info("Downloading: %s", self.base_url) + r = await client.get(self.base_url) + if r.status_code != 200: + logger.error("Download failed: %s", self.base_url) + return False + date = now.shift(days=-7).date().strftime("%Y-%m-%d") + self.data[date] = r.text.splitlines()[2:-2] - loop.run_until_complete(asyncio.gather(*task)) - logger.info("Download Complete, Now writing") + async def run(self): + await self.fetch() + + +async def write_files(data: List[dict]): base_path = pathlib.Path("nrd") if not base_path.exists(): base_path.mkdir() combined_data: Dict[str, set] = {} - for data in [ph1.data, ph2.data, ph3.data]: + for data in [ph4.data]: for key, value in data.items(): if key not in combined_data: combined_data[key] = set(value) else: combined_data[key].update(value) - sort_date = sorted(combined_data.keys(), reverse=True) + sort_date = sorted(combined_data.keys(), reverse=True)[:30] accumulate = "" for date in range(len(sort_date)): accumulate += "\n".join(combined_data[sort_date[date]]) - accumulate = "\n".join(sorted(set(accumulate.split("\n")))) - base_path.joinpath(f"past-{(date + 1):02d}day.txt").write_bytes(accumulate.encode()) + # accumulate = "\n".join(sorted(set(accumulate.split("\n")))) + base_path.joinpath(f"past-{(date + 1):02d}day.txt").write_bytes( + accumulate.encode() + ) + + +if __name__ == "__main__": + import time + start = time.time() + loop = asyncio.get_event_loop() + ph1 = Phase1() + ph2 = Phase2() + ph3 = Phase3() + ph4 = Phase4() + + task = [ph1.run(loop), ph2.run(), ph3.run(), ph4.run()] + + loop.run_until_complete(asyncio.gather(*task)) + logger.info("Download Complete, Now writing") + loop.run_until_complete(write_files([ph1.data, ph2.data, ph3.data, ph4.data])) + end = time.time() - start + logger.info(f"Time taken: {end:.2f} seconds") From 87b58077af6747ded84c0a07e23a55cca493b2f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=B8=A2=E4=BD=8E=E5=90=B8?= Date: Sat, 13 Apr 2024 23:41:01 +0800 Subject: [PATCH 4/5] =?UTF-8?q?=E2=9A=A1=20=E6=96=B0=E5=A2=9E=E9=87=8D?= =?UTF-8?q?=E8=A9=A6=E6=A9=9F=E5=88=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- AutoBuild/nrdlist.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/AutoBuild/nrdlist.py b/AutoBuild/nrdlist.py index 995e39a..de056f7 100644 --- a/AutoBuild/nrdlist.py +++ b/AutoBuild/nrdlist.py @@ -132,16 +132,23 @@ class Phase4: self.data[date] = r.text.splitlines()[2:-2] async def run(self): - await self.fetch() + for i in range(5): + try: + await self.fetch() + except httpx.ReadTimeout: + logger.error("Phase4: Timeout, retrying") + continue + finally: + break -async def write_files(data: List[dict]): +async def write_files(datalist: List[Dict[str, List[str]]]): base_path = pathlib.Path("nrd") if not base_path.exists(): base_path.mkdir() combined_data: Dict[str, set] = {} - for data in [ph4.data]: + for data in datalist: for key, value in data.items(): if key not in combined_data: combined_data[key] = set(value) @@ -168,7 +175,6 @@ if __name__ == "__main__": ph4 = Phase4() task = [ph1.run(loop), ph2.run(), ph3.run(), ph4.run()] - loop.run_until_complete(asyncio.gather(*task)) logger.info("Download Complete, Now writing") loop.run_until_complete(write_files([ph1.data, ph2.data, ph3.data, ph4.data])) From 43b5c0aa8a5c1e93b5aaa89cde6ecfff31527d52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=B8=A2=E4=BD=8E=E5=90=B8?= Date: Sun, 14 Apr 2024 00:03:33 +0800 Subject: [PATCH 5/5] =?UTF-8?q?=F0=9F=93=9D=20=E6=9B=B4=E6=96=B0=20README?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 3da0f95..f855ed2 100644 --- a/README.md +++ b/README.md @@ -52,11 +52,17 @@ https://t.me/adblock_tw - -| hosts 清單一覽 | LowTechHost | TW165 台灣反詐騙 | TW RPZ 阻止解析清單 | NoFarm 農場文清單 | -| -------------- | ---------------------------------------------------- | ---------------------------------------------------- | -------------------------------------------------------- | ----------------------------------------------------- | -| Adblock 語法 | [訂閱連結](https://filter.futa.gg/hosts_abp.txt) | [訂閱連結](https://filter.futa.gg/TW165_abp.txt) | [訂閱連結](https://filter.futa.gg/TWNIC-RPZ_abp.txt) | [訂閱連結](https://filter.futa.gg/nofarm_abp.txt) | -| hosts | [訂閱連結](https://filter.futa.gg/hosts.txt) | [訂閱連結](https://filter.futa.gg/TW165_hosts.txt) | [訂閱連結](https://filter.futa.gg/TWNIC-RPZ_hosts.txt) | [訂閱連結](https://filter.futa.gg/nofarm_hosts.txt) | -| 純網域 | [訂閱連結](https://filter.futa.gg/hosts_domains.txt) | [訂閱連結](https://filter.futa.gg/TW165_domains.txt) | [訂閱連結](https://filter.futa.gg/TWNIC-RPZ_domains.txt) | [訂閱連結](https://filter.futa.gg/nofarm_domains.txt) | +> 從網路上羅列的 NRD 清單,經過整理的版本,NRD (Newly Registered Domain +) 意思是近期註冊的網域,通常新註冊的網域有較高的風險,經常為詐騙集團所用,此清單提供過去 1 天致 30 天的清單 +- (過去 1 天,hosts 格式) +- (過去 7 天,adblock 格式) + + +| hosts 清單一覽 | LowTechHost | TW165 台灣反詐騙 | TW RPZ 阻止解析清單 | NoFarm 農場文清單 | NRD 清單(過去1天) | +| -------------- | ---------------------------------------------------- | ---------------------------------------------------- | -------------------------------------------------------- | ----------------------------------------------------- | ---------------------------------------------------- | +| Adblock 語法 | [訂閱連結](https://filter.futa.gg/hosts_abp.txt) | [訂閱連結](https://filter.futa.gg/TW165_abp.txt) | [訂閱連結](https://filter.futa.gg/TWNIC-RPZ_abp.txt) | [訂閱連結](https://filter.futa.gg/nofarm_abp.txt) | [訂閱連結](https://filter.futa.gg/nrd/past-01day_abp.txt) | +| hosts | [訂閱連結](https://filter.futa.gg/hosts.txt) | [訂閱連結](https://filter.futa.gg/TW165_hosts.txt) | [訂閱連結](https://filter.futa.gg/TWNIC-RPZ_hosts.txt) | [訂閱連結](https://filter.futa.gg/nofarm_hosts.txt) | [訂閱連結](https://filter.futa.gg/nrd/past-01day_hosts.txt) | +| 純網域 | [訂閱連結](https://filter.futa.gg/hosts_domains.txt) | [訂閱連結](https://filter.futa.gg/TW165_domains.txt) | [訂閱連結](https://filter.futa.gg/TWNIC-RPZ_domains.txt) | [訂閱連結](https://filter.futa.gg/nofarm_domains.txt) | [訂閱連結](https://filter.futa.gg/nrd/past-01day_domains.txt) |