# LowTechFilter/AutoBuild/builder.py
import asyncio
import re
from datetime import datetime, timedelta, timezone
from glob import glob

import requests

# Domain validation
DOMAIN_PATTERN = re.compile(r"^(?:[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?\.)+[a-z]{2,}$", re.IGNORECASE)

filterlist = {
    "abp": [
        "experimental.txt",
        "filter.txt",
        "PureView/news.txt",
        "PureView/news_mobile.txt",
    ],
    "hosts": ["hosts.txt", "nofarm.txt", "TWNIC-RPZ.txt", "TW165.txt"],
}
url = "https://filter.futa.gg/"
tz = timezone(timedelta(hours=+8))
today = datetime.now(tz).date()

# Domains that must never end up in the blocklists (subdomains included)
WHITELIST = [
    "google.com",
    "line.me",
    "apple.com",
]

# Append the newly-registered-domain (NRD) lists
for nrd_file in glob("nrd/past-*.txt"):
    filterlist["hosts"].append(nrd_file)


class HEAD:
    abp: str = (
        "[Adblock Plus]\n"
        "! Title: LowTechFilter {name}\n"
        "! Version: {version}\n"
        "! Expires: 1 hour\n"
        "! Homepage: https://t.me/AdBlock_TW\n"
        "! ----------------------------------------------------------------------\n"
    )
    hosts: str = (
        "! FutaHosts\n"
        "! Title: LowTechFilter {name}\n"
        "! URL: https://github.com/FutaGuard/LowTechFilter\n"
        "! Version: {version}\n"
        "! --------------------------------------------------\n"
    )
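

# Illustrative rendering (values made up, not taken from a real build):
# HEAD.abp.format(name="filter abp", version="2026.0203.1") produces a header
# beginning with "[Adblock Plus]", then "! Title: LowTechFilter filter abp",
# then the version line.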


def is_valid_domain(hostname: str) -> bool:
    """Check whether hostname is a complete, valid domain name."""
    # Full domain names are capped at 253 characters in their textual form
    if not hostname or len(hostname) > 253:
        return False
    if hostname.startswith(".") or hostname.endswith("."):
        return False
    if ".." in hostname:
        return False
    if not DOMAIN_PATTERN.match(hostname):
        return False
    parts = hostname.split(".")
    if len(parts) < 2:
        return False
    tld = parts[-1]
    if len(tld) < 2 or not tld.isalpha():
        return False
    return True
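

# Illustrative examples (a sketch; this helper is never called by the build):
# expected behaviour of is_valid_domain on a few hand-picked inputs.
def _is_valid_domain_examples() -> None:
    assert is_valid_domain("example.com")
    assert is_valid_domain("a-b.example.co")
    assert not is_valid_domain("localhost")      # single label, no TLD
    assert not is_valid_domain("foo..bar.com")   # empty label
    assert not is_valid_domain("93.184.216.34")  # numeric TLD is rejected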


def strip_bang_comments(text: str) -> str:
    """Drop every line whose first non-space character is '!' (ABP comments)."""
    lines = text.splitlines(keepends=True)
    return "".join(line for line in lines if not line.lstrip().startswith("!"))


def update_version(filename: str) -> str:
    """Fetch the published list and bump its date-based version (YYYY.MMDD.N)."""
    pattern = r"(?<=Version: )(\d+\.\d+\.)(\d+)"
    version = None
    r = requests.get(url + filename, timeout=30)
    if r.status_code == 200:
        # The version line sits within the first few header lines
        first = "\n".join(r.text.splitlines()[:5])
        try:
            version = re.findall(pattern, first, re.MULTILINE)[0]
        except IndexError:
            pass
    if version is None:
        # Fallback seed version:
        # https://www.ptt.cc/bbs/Battlegirlhs/M.1506615677.A.1A4.html
        version = ("2017.0929.", "1")
    dt = datetime.strptime(version[0], "%Y.%m%d.").date()
    newversion = today.strftime("%Y.%m%d.")
    if dt != today:
        newversion += "1"
    else:
        newversion += str(int(version[1]) + 1)
    return newversion
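

# Version scheme worked example (assuming "today" is 2026-02-03, UTC+8):
#   published "2026.0203.4" -> same day, bump the counter -> "2026.0203.5"
#   published "2026.0202.9" -> new day, reset the counter -> "2026.0203.1"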


# Emit the hosts-file variant of a list
async def to_hosts(filename: str, data: str, newversion: str):
    lines = data.splitlines()
    newdata = "\n".join(lines)
    name = filename.split(".txt")[0].split("_")[0]
    heads: str = getattr(HEAD, "hosts")
    newhead = heads.format(name=name + " hosts", version=newversion)
    newfilename = name + "_hosts.txt" if name != "hosts" else "hosts.txt"
    with open(newfilename, "w") as output:
        if name == "hosts":
            # Pull the domain out of each ||domain^ rule; wildcard rules
            # cannot be expressed in hosts syntax, so they are skipped
            pattern = r"(?<=^\|\|)\S+\.\S{2,}(?=\^)"
            newoutput = ""
            for e in re.findall(pattern, newdata, re.MULTILINE):
                if "*" not in e:
                    newoutput += "0.0.0.0 " + e + "\n"
        elif name == "TW165":
            # Only keep domains without a path
            domains = []
            for line in lines:
                parsed = parse_url(line)
                domain = parsed["domain"]
                if (
                    not parsed["has_path"]
                    and not parsed["is_ip"]
                    and not is_whitelisted_domain(domain)
                ):
                    domains.append(domain)
            newoutput = "\n".join("0.0.0.0 " + d for d in sorted(set(domains)))
        else:
            newoutput = "\n".join("0.0.0.0 " + e for e in lines)
        output.write(newhead)
        output.write(newoutput)


# Emit the ABP (Adblock Plus) variant of a list
async def to_abp(filename: str, data: str, newversion: str):
    lines = data.splitlines()
    newdata = "\n".join(lines)
    name = filename.split(".txt")[0].split("_")[0]
    heads: str = getattr(HEAD, "abp")
    newhead = heads.format(name=name + " abp", version=newversion)
    with open(name + "_abp.txt", "w") as output:
        if name == "hosts":
            output.write(newhead + newdata)
        elif name == "TW165":
            # Skip IPs; keep the full URL (domain + path)
            rules = []
            for line in lines:
                parsed = parse_url(line)
                if parsed["is_ip"]:
                    continue
                domain = parsed["domain"]
                if is_whitelisted_domain(domain):
                    continue
                rules.append(f"||{parsed['full']}^")
            newoutput = "\n".join(sorted(set(rules)))
            output.write(newhead)
            output.write(newoutput)
        else:
            newoutput = "\n".join(f"||{e}^" for e in lines)
            output.write(newhead)
            output.write(newoutput)


def parse_url(raw_url: str) -> dict:
    """Parse a URL and return its domain and path components."""
    # Strip the http:// or https:// scheme
    raw_url = raw_url.strip()
    if raw_url.startswith("http://"):
        raw_url = raw_url[7:]
    elif raw_url.startswith("https://"):
        raw_url = raw_url[8:]
    # Split domain and path
    if "/" in raw_url:
        parts = raw_url.split("/", 1)
        domain = parts[0]
        path = "/" + parts[1]
    else:
        domain = raw_url
        path = ""
    # Check whether the host part is an IPv4 literal
    is_ip = bool(re.match(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", domain))
    # Check whether it is a valid domain (IP literals count as valid hosts)
    is_valid = is_valid_domain(domain) if not is_ip else True
    return {
        "domain": domain,
        "path": path,
        "has_path": bool(path),
        "is_ip": is_ip,
        "is_valid": is_valid,
        "full": domain + path,
    }
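

# Illustrative sketch (never called by the build): the shape parse_url
# returns for one sample input; the URL below is a made-up placeholder.
def _parse_url_example() -> None:
    parsed = parse_url("https://scam.example.tw/login")
    assert parsed["domain"] == "scam.example.tw"
    assert parsed["path"] == "/login"
    assert parsed["has_path"] and not parsed["is_ip"]
    assert parsed["full"] == "scam.example.tw/login"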


def is_whitelisted_domain(domain: str) -> bool:
    """Return True if domain equals a WHITELIST entry or is a subdomain of one."""
    if not domain:
        return False
    domain = domain.lower()
    for item in WHITELIST:
        entry = item.lower()
        if domain == entry or domain.endswith("." + entry):
            return True
    return False
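

# Illustrative examples (a sketch, not part of the build): the whitelist
# matches the apex domain and any subdomain, but not lookalike registrations.
def _is_whitelisted_domain_examples() -> None:
    assert is_whitelisted_domain("google.com")
    assert is_whitelisted_domain("mail.google.com")
    assert not is_whitelisted_domain("notgoogle.com")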


# Emit the plain domain-list variant
async def to_pure_domain(filename: str, data: str):
    lines = data.splitlines()
    newdata = "\n".join(lines)
    name = filename.split(".txt")[0].split("_")[0]
    with open(name + "_domains.txt", "w") as output:
        if name == "hosts":
            pattern = r"(?<=^\|\|)\S+\.\S{2,}(?=\^)"
            newoutput = "\n".join(re.findall(pattern, newdata, re.MULTILINE))
        elif name == "TW165":
            # Keep only domains without a path, matching the hosts output
            domains = []
            for line in lines:
                line = line.strip()
                if not line:
                    continue
                parsed = parse_url(line)
                domain = parsed["domain"]
                if (
                    not parsed["has_path"]
                    and not parsed["is_ip"]
                    and parsed["is_valid"]
                    and not is_whitelisted_domain(domain)
                ):
                    domains.append(domain)
            newoutput = "\n".join(sorted(set(domains)))
        else:
            newoutput = "\n".join(lines)
        output.write(newoutput)
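

# Illustrative summary of the three writers for one hosts.txt rule
# "||ads.example^" (a made-up entry): to_hosts emits "0.0.0.0 ads.example",
# to_abp keeps the ABP rule as-is, and to_pure_domain emits the bare
# "ads.example".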


async def run():
    for category in filterlist:
        for filename in filterlist[category]:
            newversion = update_version(filename)
            with open(filename, "r") as files:
                data = strip_bang_comments(files.read())
            with open(filename, "w") as output:
                heads: str = getattr(HEAD, category)
                newhead = heads.format(
                    name=filename.split(".")[0]
                    .replace("_", " ")
                    .replace("/", " ")
                    .title(),
                    version=newversion,
                )
                output.write(newhead)
                output.write(data)
            # Hide content-farm sites on Google: convert to ABP cosmetic rules
            if filename == "nofarm.txt":
                domain_list = ""
                for rule in data.splitlines():
                    if rule and not rule.startswith("!"):
                        # Strip the leading "||" and trailing "^" from the rule
                        domain = rule[2:-1]
                        domain_list += 'google.*##div.g:has(div[data-hveid] a[href*="{domain}"])\n'.format(
                            domain=domain
                        )
                heads: str = getattr(HEAD, "abp")
                newhead = heads.format(
                    name="hide farm content from google", version=newversion
                )
                with open("hide_farm_from_search.txt", "w") as f:
                    f.write(newhead + domain_list)
            if filename == "TW165.txt":
                # Generate the redirect list: only domains without a path
                newfilename = "TW165_redirect.txt"
                heads: str = getattr(HEAD, "abp")
                newhead = heads.format(name="TW165 redirect", version=newversion)
                with open(newfilename, "w") as f:
                    f.write(newhead)
                    domains = []
                    for line in data.splitlines():
                        parsed = parse_url(line)
                        domain = parsed["domain"]
                        if (
                            not parsed["has_path"]
                            and not parsed["is_ip"]
                            and not is_whitelisted_domain(domain)
                        ):
                            domains.append(domain)
                    # AdGuard dnsrewrite rules answering with the redirect IP
                    f.write(
                        "\n".join(
                            f"||{d}^$dnsrewrite=NOERROR;A;34.102.218.71"
                            for d in sorted(set(domains))
                        )
                    )
            if category == "hosts":
                task = [
                    asyncio.create_task(to_pure_domain(filename, data)),
                    asyncio.create_task(to_abp(filename, data, newversion)),
                    asyncio.create_task(to_hosts(filename, data, newversion)),
                ]
                await asyncio.gather(*task)


if __name__ == "__main__":
    asyncio.run(run())