diff --git a/main.py b/main.py index 10b35f3..28e9c74 100644 --- a/main.py +++ b/main.py @@ -42,7 +42,7 @@ def has_ip(line: str) -> bool: ipv4_re = re.compile(r'((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.?\b){4}') ipv6_re = re.compile( r'(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9]))') - return bool(ipv4_re.findall(line)) or bool(ipv6_re.findall(line)) + return bool(ipv4_re.search(line)) or bool(ipv6_re.search(line)) def is_exception(line: str) -> bool: @@ -127,7 +127,7 @@ def gfwlist_line_converter(line: str) -> str: if m in line: return invalid_rule_return() # 移除非asci字符 - if re.match(r'%\w\w', line): + if re.search(r'%\w\w', line): return invalid_rule_return() line = convert_asterisk(line) @@ -139,10 +139,34 @@ def gfwlist_line_converter(line: str) -> str: return line -def get_gfwlist_hosts() -> set[str]: +def is_valid_hostname(domain: str) -> bool: + # https://stackoverflow.com/questions/1418423/the-hostname-regex + domain_re = re.compile( + r'^(?=.{1,255}$)[0-9A-Za-z](?:(?:[0-9A-Za-z]|-){0,61}[0-9A-Za-z])?(?:\.[0-9A-Za-z](?:(?:[0-9A-Za-z]|-){0,61}[0-9A-Za-z])?)*\.?$') + return bool(domain_re.match(domain)) + + +def hosts_deduplicate(hosts: list[str]) -> list[str]: + hosts = list(set(hosts)) + for h in hosts.copy(): + if not is_valid_hostname(h): + logging.warning('{host} is invalid!'.format(host=h)) + hosts.remove(h) + + hosts_copy = hosts.copy() + for v in hosts_copy: + for k in hosts_copy: + if k != v and k.endswith('.' + v): + logging.debug('found duplicate: {k} {v}'.format(k=k, v=v)) + hosts.remove(k) + + return hosts + + +def get_gfwlist_hosts() -> list[str]: gfwlist_text = get_gfwlist_text() gfwlist_lines = gfwlist_text.splitlines() - return set( + gfwlist_hosts = list( filter( lambda line: line != "", map( @@ -151,6 +175,9 @@ def get_gfwlist_hosts() -> set[str]: ) ) ) + gfwlist_hosts = hosts_deduplicate(gfwlist_hosts) + logging.info('found {num} gfwlist host'.format(num=len(gfwlist_hosts))) + return sorted(gfwlist_hosts) def get_dnsmasq_text() -> str: @@ -158,7 +185,7 @@ def get_dnsmasq_text() -> str: map( lambda host: "server=/{host}/{dns_ip}#{dns_port}".format(host=host, dns_ip=PROXY_DNS_IP, dns_port=PROXY_DNS_PORT), - sorted(list(get_gfwlist_hosts())) + get_gfwlist_hosts() ) ) return '\n'.join(rule_list)