Mirror of https://github.com/FutaGuard/LowTechFilter.git
Synced 2025-06-21 05:21:02 +08:00
Commit 104b6beb60
@@ -1,26 +1,27 @@
-import signal
-from typing import Dict
-
-import httpx
-import arrow
-from base64 import b64encode
-import pathlib
-import logging
 import asyncio
+import csv
+import gzip
+import logging
+import os
+import pathlib
+from base64 import b64encode
+from io import BytesIO, StringIO
+from typing import Dict, List
 from zipfile import ZipFile, BadZipfile
-from io import BytesIO
+
+import arrow
+import httpx
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 
-class Downloader:
+class Phase1:
     def __init__(self):
-        self.base_url = "https://www.whoisds.com//whois-database/newly-registered-domains/{args}/nrd"
-        self.base_path = pathlib.Path("nrd")
-        self.data: Dict[str, BytesIO] = {}
-        if not self.base_path.exists():
-            self.base_path.mkdir()
+        self.base_url = os.getenv("PHASE1_URL")
+        if not self.base_url:
+            raise ValueError("PHASE1_URL not set")
+        self.data: Dict[str, List[str]] = {}
 
     async def fetch(self, date: arrow.Arrow) -> bool:
         logger.info("Downloading: %s", date.format("YYYY-MM-DD"))
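(The middle of `fetch` is unchanged between these two hunks and therefore not shown. Judging from the retained `b64encode` import and the old `{args}` placeholder in `base_url`, it presumably base64-encodes a date-stamped ZIP filename into the download URL. The sketch below is a hypothetical reconstruction for orientation only, not part of this commit.)

    # Hypothetical sketch of the unchanged middle of Phase1.fetch:
    # whoisds-style NRD endpoints take a base64-encoded "<YYYY-MM-DD>.zip"
    # path segment, which the old base_url's {args} placeholder suggests.
    args = b64encode(f'{date.format("YYYY-MM-DD")}.zip'.encode()).decode()
    url = self.base_url.format(args=args)
    async with httpx.AsyncClient() as client:
        r = await client.get(url)      # fetch the day's ZIP archive
    zip_file = BytesIO(r.content)      # consumed by ZipFile in the next hunk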
@@ -36,37 +37,146 @@ class Downloader:
         try:
             with ZipFile(zip_file, "r") as zip_obj:
                 # print(zip_obj.read('domain-names.txt'))
-                self.data[date.format("YYYY-MM-DD")] = zip_obj.read(
-                    "domain-names.txt"
+                self.data[date.format("YYYY-MM-DD")] = (
+                    zip_obj.read("domain-names.txt").decode().splitlines()
                 )
         except BadZipfile:
             logger.error("Bad Zipfile: %s", url)
             return False
         return True
 
-    async def write(self):
-        sort_date = sorted(self.data.keys(), reverse=True)
-        accumulate = ""
-        for date in range(len(sort_date)):
-            accumulate += self.data[sort_date[date]].decode()
-            accumulate = "\n".join(sorted(set(accumulate.split("\n"))))
-            self.base_path.joinpath(f"past-{date+1}day.txt").write_bytes(
-                accumulate.encode()
-            )
-
-    def run(self):
-        loop = asyncio.get_event_loop()
-        loop.add_signal_handler(signal.SIGILL, loop.stop)
-        loop.add_signal_handler(signal.SIGINT, loop.stop)
+    async def run(self, loop: asyncio.AbstractEventLoop):
         today = arrow.utcnow()
-        task = []
-        for i in range(1, 30, 5):
+        for i in range(1, 31, 5):
+            task = []
             for j in range(i, i + 5):
                 date = today.shift(days=-j)
                 task.append(loop.create_task(self.fetch(date)))
-            loop.run_until_complete(asyncio.gather(*task))
-        asyncio.run(self.write())
+            await asyncio.gather(*task)
+
+
+class Phase2:
+    def __init__(self):
+        self.base_url = os.getenv("PHASE2_URL")
+        if not self.base_url:
+            raise ValueError("PHASE2_URL not set")
+        self.data: Dict[str, List[str]] = {}
+
+    async def fetch(self):
+        now = arrow.utcnow()
+        async with httpx.AsyncClient() as client:
+            for files in ["nrd-1m.csv", "nrd-1w.csv"]:
+                url = self.base_url + files
+                logger.info("Downloading: %s", files)
+                r = await client.get(url)
+                if r.status_code != 200:
+                    logger.error("Download failed: %s", files)
+                    return False
+                if files == "nrd-1m.csv":
+                    self.data[now.shift(months=-1).date().strftime("%Y-%m-%d")] = (
+                        BytesIO(r.content).getvalue().decode().splitlines()
+                    )
+                else:
+                    self.data[now.shift(weeks=-1).date().strftime("%Y-%m-%d")] = (
+                        BytesIO(r.content).getvalue().decode().splitlines()
+                    )
+
+    async def run(self):
+        await self.fetch()
+
+
+class Phase3:
+    def __init__(self):
+        self.base_url = os.getenv("PHASE3_URL")
+        if not self.base_url:
+            raise ValueError("PHASE3_URL not set")
+        self.data: Dict[str, List[str]] = {}
+
+    async def fetch(self):
+        async with httpx.AsyncClient() as client:
+            logger.info("Downloading: %s", self.base_url)
+            r = await client.get(self.base_url)
+            if r.status_code != 200:
+                logger.error("Download failed: %s", self.base_url)
+                return False
+
+            with gzip.GzipFile(fileobj=BytesIO(r.content), mode="rb") as f:
+                raw_data = BytesIO(f.read()).getvalue().decode()
+
+            data_file = StringIO(raw_data)
+            reader = csv.DictReader(data_file)
+            for row in reader:
+                if row["create_date"]:
+                    self.data.setdefault(row["create_date"], []).append(row["domain_name"])
+
+    async def run(self):
+        await self.fetch()
+
+
+class Phase4:
+    def __init__(self):
+        self.base_url = os.getenv("PHASE4_URL")
+        if not self.base_url:
+            raise ValueError("PHASE4_URL not set")
+        self.data: Dict[str, List[str]] = {}
+
+    async def fetch(self):
+        now = arrow.utcnow()
+        async with httpx.AsyncClient() as client:
+            logger.info("Downloading: %s", self.base_url)
+            r = await client.get(self.base_url)
+            if r.status_code != 200:
+                logger.error("Download failed: %s", self.base_url)
+                return False
+            date = now.shift(days=-7).date().strftime("%Y-%m-%d")
+            self.data[date] = r.text.splitlines()[2:-2]
+
+    async def run(self):
+        for i in range(5):
+            try:
+                await self.fetch()
+            except httpx.ReadTimeout:
+                logger.error("Phase4: Timeout, retrying")
+                continue
+            finally:
+                break
+
+
+async def write_files(datalist: List[Dict[str, List[str]]]):
+    base_path = pathlib.Path("nrd")
+    if not base_path.exists():
+        base_path.mkdir()
+
+    combined_data: Dict[str, set] = {}
+    for data in datalist:
+        for key, value in data.items():
+            if key not in combined_data:
+                combined_data[key] = set(value)
+            else:
+                combined_data[key].update(value)
+
+    sort_date = sorted(combined_data.keys(), reverse=True)[:30]
+    accumulate = ""
+    for date in range(len(sort_date)):
+        accumulate += "\n".join(combined_data[sort_date[date]])
+        # accumulate = "\n".join(sorted(set(accumulate.split("\n"))))
+        base_path.joinpath(f"past-{(date + 1):02d}day.txt").write_bytes(
+            accumulate.encode()
+        )
 
 
 if __name__ == "__main__":
-    Downloader().run()
+    import time
+
+    start = time.time()
+    loop = asyncio.get_event_loop()
+    ph1 = Phase1()
+    ph2 = Phase2()
+    ph3 = Phase3()
+    ph4 = Phase4()
+
+    task = [ph1.run(loop), ph2.run(), ph3.run(), ph4.run()]
+    loop.run_until_complete(asyncio.gather(*task))
+    logger.info("Download Complete, Now writing")
+    loop.run_until_complete(write_files([ph1.data, ph2.data, ph3.data, ph4.data]))
+    end = time.time() - start
+    logger.info(f"Time taken: {end:.2f} seconds")
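A few details of the new code are worth flagging. In `Phase4.run`, the `finally: break` executes even after the `continue` in the `except` branch (a `break` inside `finally` overrides pending control flow), so the loop always stops after the first attempt and the intended retry never happens. The module-level `asyncio.get_event_loop()` / `run_until_complete` pattern is also deprecated since Python 3.10 in favour of `asyncio.run`. And `write_files` appends each day's block without a separating newline, which fuses the last domain of one day with the first domain of the next. A hedged sketch of how the first two points could be written instead, assuming the classes above; this is not part of the commit:

    # Inside Phase4:
    async def run(self) -> None:
        # Retry up to 5 times; a bare `break` after the try/except fires only
        # on success, unlike `finally: break`, which cancels every retry.
        for _ in range(5):
            try:
                await self.fetch()
            except httpx.ReadTimeout:
                logger.error("Phase4: Timeout, retrying")
                continue
            break

    # At module level:
    async def main() -> None:
        loop = asyncio.get_running_loop()
        ph1, ph2, ph3, ph4 = Phase1(), Phase2(), Phase3(), Phase4()
        await asyncio.gather(ph1.run(loop), ph2.run(), ph3.run(), ph4.run())
        logger.info("Download Complete, Now writing")
        await write_files([ph1.data, ph2.data, ph3.data, ph4.data])

    if __name__ == "__main__":
        asyncio.run(main())  # replaces get_event_loop()/run_until_complete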
README.md (16 lines changed)
@@ -52,11 +52,17 @@ https://t.me/adblock_tw
 
 - <https://filter.futa.gg/TW165_domains.txt>
 
-| hosts lists at a glance | LowTechHost | TW165 Taiwan anti-scam | TW RPZ resolution-blocking list | NoFarm content-farm list |
-| ----------------------- | ----------- | ---------------------- | ------------------------------- | ------------------------ |
-| Adblock syntax | [Subscription link](https://filter.futa.gg/hosts_abp.txt) | [Subscription link](https://filter.futa.gg/TW165_abp.txt) | [Subscription link](https://filter.futa.gg/TWNIC-RPZ_abp.txt) | [Subscription link](https://filter.futa.gg/nofarm_abp.txt) |
-| hosts | [Subscription link](https://filter.futa.gg/hosts.txt) | [Subscription link](https://filter.futa.gg/TW165_hosts.txt) | [Subscription link](https://filter.futa.gg/TWNIC-RPZ_hosts.txt) | [Subscription link](https://filter.futa.gg/nofarm_hosts.txt) |
-| Plain domains | [Subscription link](https://filter.futa.gg/hosts_domains.txt) | [Subscription link](https://filter.futa.gg/TW165_domains.txt) | [Subscription link](https://filter.futa.gg/TWNIC-RPZ_domains.txt) | [Subscription link](https://filter.futa.gg/nofarm_domains.txt) |
+> A tidied-up version of NRD lists collected from around the web. NRD (Newly Registered Domain) means a recently registered domain; newly registered domains tend to carry higher risk and are frequently used by scam operations. Lists are provided for the past 1 through 30 days.
+
+- <https://filter.futa.gg/ndr/past-01day_hosts.txt> (past 1 day, hosts format)
+- <https://filter.futa.gg/ndr/past-07day_abp.txt> (past 7 days, adblock format)
+
+| hosts lists at a glance | LowTechHost | TW165 Taiwan anti-scam | TW RPZ resolution-blocking list | NoFarm content-farm list | NRD list (past 1 day) |
+| ----------------------- | ----------- | ---------------------- | ------------------------------- | ------------------------ | --------------------- |
+| Adblock syntax | [Subscription link](https://filter.futa.gg/hosts_abp.txt) | [Subscription link](https://filter.futa.gg/TW165_abp.txt) | [Subscription link](https://filter.futa.gg/TWNIC-RPZ_abp.txt) | [Subscription link](https://filter.futa.gg/nofarm_abp.txt) | [Subscription link](https://filter.futa.gg/nrd/past-01day_abp.txt) |
+| hosts | [Subscription link](https://filter.futa.gg/hosts.txt) | [Subscription link](https://filter.futa.gg/TW165_hosts.txt) | [Subscription link](https://filter.futa.gg/TWNIC-RPZ_hosts.txt) | [Subscription link](https://filter.futa.gg/nofarm_hosts.txt) | [Subscription link](https://filter.futa.gg/nrd/past-01day_hosts.txt) |
+| Plain domains | [Subscription link](https://filter.futa.gg/hosts_domains.txt) | [Subscription link](https://filter.futa.gg/TW165_domains.txt) | [Subscription link](https://filter.futa.gg/TWNIC-RPZ_domains.txt) | [Subscription link](https://filter.futa.gg/nofarm_domains.txt) | [Subscription link](https://filter.futa.gg/nrd/past-01day_domains.txt) |