add hosts_deduplicate function

This commit is contained in:
bgme 2023-01-19 18:40:30 +08:00
parent aea788adf3
commit 183e073927

37
main.py
View file

@ -42,7 +42,7 @@ def has_ip(line: str) -> bool:
ipv4_re = re.compile(r'((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.?\b){4}')
ipv6_re = re.compile(
r'(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9]))')
return bool(ipv4_re.findall(line)) or bool(ipv6_re.findall(line))
return bool(ipv4_re.search(line)) or bool(ipv6_re.search(line))
def is_exception(line: str) -> bool:
@ -127,7 +127,7 @@ def gfwlist_line_converter(line: str) -> str:
if m in line:
return invalid_rule_return()
# 移除非asci字符
if re.match(r'%\w\w', line):
if re.search(r'%\w\w', line):
return invalid_rule_return()
line = convert_asterisk(line)
@ -139,10 +139,34 @@ def gfwlist_line_converter(line: str) -> str:
return line
def get_gfwlist_hosts() -> set[str]:
# Hostname syntax check, adapted from
# https://stackoverflow.com/questions/1418423/the-hostname-regex
# Anchored with \Z rather than $: in Python, $ also matches just before a
# trailing newline, which would let "example.com\n" pass as a valid hostname.
_HOSTNAME_RE = re.compile(
    r'^(?=.{1,255}\Z)[0-9A-Za-z](?:(?:[0-9A-Za-z]|-){0,61}[0-9A-Za-z])?(?:\.[0-9A-Za-z](?:(?:[0-9A-Za-z]|-){0,61}[0-9A-Za-z])?)*\.?\Z')


def is_valid_hostname(domain: str) -> bool:
    """Return True if *domain* is a syntactically valid hostname.

    The whole string must be 1-255 characters long and consist of
    dot-separated labels of 1-63 ASCII letters, digits or hyphens, where no
    label starts or ends with a hyphen. A single trailing dot is allowed.

    Args:
        domain: The candidate hostname.

    Returns:
        True when the entire string matches the hostname syntax, else False.
    """
    return _HOSTNAME_RE.match(domain) is not None
def hosts_deduplicate(hosts: list[str]) -> list[str]:
    """Drop invalid hostnames and hostnames covered by a parent domain.

    A host is dropped as a duplicate when another valid host in *hosts* is
    one of its parent domains, e.g. ``a.example.com`` is dropped when
    ``example.com`` is present.

    Args:
        hosts: Candidate hostnames; may contain repeats and invalid entries.

    Returns:
        The unique, valid hostnames that have no parent domain in the input.
        The order of the result is unspecified.
    """
    valid_hosts = set()
    for host in set(hosts):
        if is_valid_hostname(host):
            valid_hosts.add(host)
        else:
            logging.warning('{host} is invalid!'.format(host=host))

    result = []
    for host in valid_hosts:
        labels = host.split('.')
        # Every proper suffix starting on a label boundary is a candidate
        # parent domain. Looking these up in the set decides each host exactly
        # once, so a host with several ancestors in the input (a.b.c, b.c, c)
        # can no longer trigger a second list.remove() and a ValueError.
        parents = ('.'.join(labels[i:]) for i in range(1, len(labels)))
        parent = next((p for p in parents if p in valid_hosts), None)
        if parent is None:
            result.append(host)
        else:
            logging.debug('found duplicate: {k} {v}'.format(k=host, v=parent))
    return result
def get_gfwlist_hosts() -> list[str]:
gfwlist_text = get_gfwlist_text()
gfwlist_lines = gfwlist_text.splitlines()
return set(
gfwlist_hosts = list(
filter(
lambda line: line != "",
map(
@ -151,6 +175,9 @@ def get_gfwlist_hosts() -> set[str]:
)
)
)
gfwlist_hosts = hosts_deduplicate(gfwlist_hosts)
logging.info('found {num} gfwlist host'.format(num=len(gfwlist_hosts)))
return sorted(gfwlist_hosts)
def get_dnsmasq_text() -> str:
@ -158,7 +185,7 @@ def get_dnsmasq_text() -> str:
map(
lambda host: "server=/{host}/{dns_ip}#{dns_port}".format(host=host, dns_ip=PROXY_DNS_IP,
dns_port=PROXY_DNS_PORT),
sorted(list(get_gfwlist_hosts()))
get_gfwlist_hosts()
)
)
return '\n'.join(rule_list)