From db992ca92ab2a2a03fbe62936ed40d9074727a57 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Fri, 12 Sep 2025 14:59:44 +0300 Subject: [PATCH 1/6] gh-138907: Support RFC 9309 in robotparser --- Lib/test/test_robotparser.py | 321 +++++++++++++++++++++++++++++++---- Lib/urllib/robotparser.py | 209 +++++++++++++++-------- 2 files changed, 428 insertions(+), 102 deletions(-) diff --git a/Lib/test/test_robotparser.py b/Lib/test/test_robotparser.py index e33723cc70c877..6483d4d071fbc6 100644 --- a/Lib/test/test_robotparser.py +++ b/Lib/test/test_robotparser.py @@ -15,14 +15,18 @@ class BaseRobotTest: good = [] bad = [] site_maps = None + expected_output = None def __init_subclass__(cls): super().__init_subclass__() # Remove tests that do nothing. - if not cls.good: - cls.test_good_urls = None - if not cls.bad: - cls.test_bad_urls = None + if issubclass(cls, unittest.TestCase): + if not cls.good: + cls.test_good_urls = None + if not cls.bad: + cls.test_bad_urls = None + if cls.expected_output is None: + cls.test_string_formatting = None def setUp(self): lines = io.StringIO(self.robots_txt).readlines() @@ -50,6 +54,8 @@ def test_bad_urls(self): def test_site_maps(self): self.assertEqual(self.parser.site_maps(), self.site_maps) + def test_string_formatting(self): + self.assertEqual(str(self.parser), self.expected_output) class UserAgentWildcardTest(BaseRobotTest, unittest.TestCase): robots_txt = """\ @@ -61,6 +67,56 @@ class UserAgentWildcardTest(BaseRobotTest, unittest.TestCase): good = ['/', '/test.html'] bad = ['/cyberworld/map/index.html', '/tmp/xxx', '/foo.html'] +class SimpleExampleTest(BaseRobotTest, unittest.TestCase): + # Example from RFC 9309, section 5.1. + robots_txt = """\ +User-Agent: * +Disallow: *.gif$ +Disallow: /example/ +Allow: /publications/ + +User-Agent: foobot +Disallow:/ +Allow:/example/page.html +Allow:/example/allowed.gif + +User-Agent: barbot +User-Agent: bazbot +Disallow: /example/page.html + +User-Agent: quxbot + """ + good = [ + '/', '/publications/', + ('foobot', '/example/page.html'), ('foobot', '/example/allowed.gif'), + ('barbot', '/'), ('barbot', '/example/'), + ('barbot', '/example/allowed.gif'), + ('barbot', '/example/disallowed.gif'), + ('barbot', '/publications/'), + ('barbot', '/publications/allowed.gif'), + ('bazbot', '/'), ('bazbot', '/example/'), + ('bazbot', '/example/allowed.gif'), + ('bazbot', '/example/disallowed.gif'), + ('bazbot', '/publications/'), + ('bazbot', '/publications/allowed.gif'), + ('quxbot', '/'), ('quxbot', '/example/'), + ('quxbot', '/example/page.html'), ('quxbot', '/example/allowed.gif'), + ('quxbot', '/example/disallowed.gif'), + ('quxbot', '/publications/'), + ('quxbot', '/publications/allowed.gif'), + ] + bad = [ + '/example/', '/example/page.html', '/example/allowed.gif', + '/example/disallowed.gif', + '/publications/allowed.gif', + ('foobot', '/'), ('foobot', '/example/'), + ('foobot', '/example/disallowed.gif'), + ('foobot', '/publications/'), + ('foobot', '/publications/allowed.gif'), + ('barbot', '/example/page.html'), + ('bazbot', '/example/page.html'), + ] + class CrawlDelayAndCustomAgentTest(BaseRobotTest, unittest.TestCase): robots_txt = """\ @@ -137,6 +193,7 @@ def test_request_rate(self): class EmptyFileTest(BaseRequestRateTest, unittest.TestCase): robots_txt = '' good = ['/foo'] + expected_output = '' class CrawlDelayAndRequestRateTest(BaseRequestRateTest, unittest.TestCase): @@ -221,17 +278,185 @@ class UserAgentGoogleMobileTest(UserAgentOrderingTest): agent = 'Googlebot-Mobile' -class 
GoogleURLOrderingTest(BaseRobotTest, unittest.TestCase): - # Google also got the order wrong. You need - # to specify the URLs from more specific to more general +class LongestMatchTest(BaseRobotTest, unittest.TestCase): + # Based on example from RFC 9309, section 5.2. robots_txt = """\ -User-agent: Googlebot -Allow: /folder1/myfile.html -Disallow: /folder1/ +User-agent: * +Allow: /example/page/ +Disallow: /example/page/disallowed.gif +Allow: /example/ """ - agent = 'googlebot' - good = ['/folder1/myfile.html'] - bad = ['/folder1/anotherfile.html'] + good = ['/example/', '/example/page/'] + bad = ['/example/page/disallowed.gif'] + + +class LongestMatchWildcardTest(BaseRobotTest, unittest.TestCase): + robots_txt = """\ +User-agent: * +Allow: /example/page/ +Disallow: *.gif +Allow: /example/ + """ + good = ['/example/', '/example/page/'] + bad = ['/example/page/disallowed.gif', '/x.gif'] + + +class AllowWinsEqualMatchTest(BaseRobotTest, unittest.TestCase): + robots_txt = """\ +User-agent: * +Disallow: /spam +Allow: /spam +Disallow: /spam + """ + good = ['/spam', '/spam/'] + + +class AllowWinsEqualFullMatchTest(BaseRobotTest, unittest.TestCase): + robots_txt = """\ +User-agent: * +Disallow: /spam +Allow: /spam$ +Disallow: /spam +Disallow: /eggs$ +Allow: /eggs +Disallow: /eggs$ + """ + good = ['/spam', '/eggs', '/eggs/'] + bad = ['/spam/'] + + +class AllowWinsEqualMatchWildcardTest(BaseRobotTest, unittest.TestCase): + robots_txt = """\ +User-agent: * +Disallow: /spam +Allow: *am +Disallow: /spam +Disallow: *gs +Allow: /eggs +Disallow: *gs + """ + good = ['/spam', '/eggs', '/spam/', '/eggs/'] + + +class MergeGroupsTest(BaseRobotTest, unittest.TestCase): + robots_txt = """\ +User-agent: spambot +Disallow: /some/path + +User-agent: spambot +Disallow: /another/path + """ + agent = 'spambot' + bad = ['/some/path', '/another/path'] + + +class UserAgentStartsGroupTest(BaseRobotTest, unittest.TestCase): + robots_txt = """\ +User-agent: spambot +Disallow: /some/path +User-agent: eggsbot +Disallow: /another/path + """ + good = [('spambot', '/'), ('spambot', '/another/path'), + ('eggsbot', '/'), ('eggsbot', '/some/path')] + bad = [('spambot', '/some/path'), ('eggsbot', '/another/path')] + expected_output = """\ +User-agent: spambot +Disallow: /some/path + +User-agent: eggsbot +Disallow: /another/path\ +""" + +class IgnoreEmptyLinesTest(BaseRobotTest, unittest.TestCase): + robots_txt = """\ +User-agent: spambot + +User-agent: eggsbot +Disallow: /some/path + +Disallow: /another/path + """ + good = [('spambot', '/'), ('eggsbot', '/')] + bad = [ + ('spambot', '/some/path'), ('spambot', '/another/path'), + ('eggsbot', '/some/path'), ('eggsbot', '/another/path'), + ] + expected_output = """\ +User-agent: spambot +User-agent: eggsbot +Disallow: /another/path +Disallow: /some/path\ +""" + + +class IgnoreRulesWithoutUserAgentTest(BaseRobotTest, unittest.TestCase): + robots_txt = """\ +Disallow: /some/path + +User-agent: * +Disallow: /another/path + """ + good = ['/', '/some/path'] + bad = ['/another/path'] + expected_output = """\ +User-agent: * +Disallow: /another/path\ +""" + + +class EmptyGroupTest(BaseRobotTest, unittest.TestCase): + robots_txt = """\ +User-agent: * +Disallow: /some/path + +User-agent: spambot + """ + agent = 'spambot' + good = ['/', '/some/path'] + expected_output = """\ +User-agent: * +Disallow: /some/path + +User-agent: spambot +Allow:\ +""" + + +class WeirdPathTest(BaseRobotTest, unittest.TestCase): + robots_txt = f"""\ +User-agent: * +Disallow: /a$$$ +Disallow: /b$z +Disallow: /c*** 
+Disallow: /d***z +Disallow: /e*$**$$ +Disallow: /f*$**$$z +Disallow: /g$*$$** +Disallow: /h$*$$**z + """ + good = ['/b', '/bz', '/ax', '/d', '/f', '/fz', '/gx', '/h', '/hz'] + bad = ['/a', '/c', '/cxy', '/dz', '/dxyz', '/dxzy', '/e', '/exy', '/g'] + expected_output = """\ +User-agent: * +Disallow: /c* +Disallow: /d*z +Disallow: /e*$ +Disallow: /a$ +Disallow: /g$\ +""" + + +class PathWithManyWildcardsTest(BaseRobotTest, unittest.TestCase): + # This test would take many years if use naive translation to regular + # expression (* -> .*). + N = 50 + robots_txt = f"""\ +User-agent: * +Disallow: /{'*a'*N}*b + """ + good = ['/' + 'a'*N + 'a'] + bad = ['/' + 'a'*N + 'b'] class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase): @@ -251,19 +476,6 @@ class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase): '/yet/one/path?name=value&more'] -class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase): - # obey first * entry (#4108) - robots_txt = """\ -User-agent: * -Disallow: /some/path - -User-agent: * -Disallow: /another/path - """ - good = ['/another/path'] - bad = ['/some/path'] - - class PercentEncodingTest(BaseRobotTest, unittest.TestCase): robots_txt = """\ User-agent: * @@ -365,17 +577,60 @@ class StringFormattingTest(BaseRobotTest, unittest.TestCase): """ expected_output = """\ -User-agent: cybermapper -Disallow: /some/path - User-agent: * Crawl-delay: 1 Request-rate: 3/15 -Disallow: /cyberworld/map/\ +Disallow: /cyberworld/map/ + +User-agent: cybermapper +Disallow: /some/path\ """ - def test_string_formatting(self): - self.assertEqual(str(self.parser), self.expected_output) + +class ConstructedStringFormattingTest(unittest.TestCase): + def test_empty(self): + parser = urllib.robotparser.RobotFileParser() + self.assertEqual(str(parser), '') + + def test_group_without_rules(self): + parser = urllib.robotparser.RobotFileParser() + entry = urllib.robotparser.Entry() + entry.useragents = ['spambot'] + parser._add_entry(entry) + entry = urllib.robotparser.Entry() + entry.useragents = ['hambot'] + entry.rulelines = [urllib.robotparser.RuleLine('/ham', False)] + parser._add_entry(entry) + entry = urllib.robotparser.Entry() + entry.useragents = ['eggsbot'] + parser._add_entry(entry) + self.assertEqual(str(parser), """\ +User-agent: spambot +Allow: + +User-agent: hambot +Disallow: /ham + +User-agent: eggsbot +Allow:\ +""") + + def test_group_without_user_agent(self): + parser = urllib.robotparser.RobotFileParser() + entry = urllib.robotparser.Entry() + entry.rulelines = [urllib.robotparser.RuleLine('/ham', False)] + parser._add_entry(entry) + entry = urllib.robotparser.Entry() + entry.useragents = ['spambot'] + entry.rulelines = [urllib.robotparser.RuleLine('/spam', False)] + parser._add_entry(entry) + entry = urllib.robotparser.Entry() + entry.rulelines = [urllib.robotparser.RuleLine('/eggs', False)] + parser._add_entry(entry) + self.assertEqual(str(parser), """\ +User-agent: spambot +Disallow: /spam\ +""") @unittest.skipUnless( @@ -495,7 +750,7 @@ def test_basic(self): def test_can_fetch(self): self.assertTrue(self.parser.can_fetch('*', self.url('elsewhere'))) self.assertFalse(self.parser.can_fetch('Nutch', self.base_url)) - self.assertFalse(self.parser.can_fetch('Nutch', self.url('brian'))) + self.assertTrue(self.parser.can_fetch('Nutch', self.url('brian'))) self.assertFalse(self.parser.can_fetch('Nutch', self.url('webstats'))) self.assertFalse(self.parser.can_fetch('*', self.url('webstats'))) self.assertTrue(self.parser.can_fetch('*', self.base_url)) diff --git 
a/Lib/urllib/robotparser.py b/Lib/urllib/robotparser.py index 4009fd6b58f594..eb85574537bce6 100644 --- a/Lib/urllib/robotparser.py +++ b/Lib/urllib/robotparser.py @@ -21,19 +21,6 @@ RequestRate = collections.namedtuple("RequestRate", "requests seconds") -def normalize(path): - unquoted = urllib.parse.unquote(path, errors='surrogateescape') - return urllib.parse.quote(unquoted, errors='surrogateescape') - -def normalize_path(path): - path, sep, query = path.partition('?') - path = normalize(path) - if sep: - query = re.sub(r'[^=&]+', lambda m: normalize(m[0]), query) - path += '?' + query - return path - - class RobotFileParser: """ This class provides a set of methods to read, parse and answer questions about a single robots.txt file. @@ -42,6 +29,7 @@ class RobotFileParser: def __init__(self, url=''): self.entries = [] + self.groups = {} self.sitemaps = [] self.default_entry = None self.disallow_all = False @@ -86,13 +74,13 @@ def read(self): self.parse(raw.decode("utf-8", "surrogateescape").splitlines()) def _add_entry(self, entry): - if "*" in entry.useragents: - # the default entry is considered last - if self.default_entry is None: - # the first default entry wins - self.default_entry = entry - else: - self.entries.append(entry) + self.entries.append(entry) + for agent in entry.useragents: + if agent not in self.groups: + self.groups[agent] = entry + else: + self.groups[agent] = merge_entries(self.groups[agent], entry) + sort_rulelines(self.groups[agent].rulelines) def parse(self, lines): """Parse the input lines from a robots.txt file. @@ -100,6 +88,7 @@ def parse(self, lines): We allow that a user-agent: line is not preceded by one or more blank lines. """ + entries = [] # states: # 0: start state # 1: saw user-agent line @@ -109,14 +98,6 @@ def parse(self, lines): self.modified() for line in lines: - if not line: - if state == 1: - entry = Entry() - state = 0 - elif state == 2: - self._add_entry(entry) - entry = Entry() - state = 0 # remove optional comment and strip line i = line.find('#') if i >= 0: @@ -132,16 +113,23 @@ def parse(self, lines): if state == 2: self._add_entry(entry) entry = Entry() - entry.useragents.append(line[1]) + product_token = line[1] + entry.useragents.append(product_token) state = 1 elif line[0] == "disallow": if state != 0: - entry.rulelines.append(RuleLine(line[1], False)) state = 2 + try: + entry.rulelines.append(RuleLine(line[1], False)) + except ValueError: + pass elif line[0] == "allow": if state != 0: - entry.rulelines.append(RuleLine(line[1], True)) state = 2 + try: + entry.rulelines.append(RuleLine(line[1], True)) + except ValueError: + pass elif line[0] == "crawl-delay": if state != 0: # before trying to convert to int we need to make @@ -164,9 +152,15 @@ def parse(self, lines): # so it doesn't matter where you place it in your file." # Therefore we do not change the state of the parser. self.sitemaps.append(line[1]) - if state == 2: + if state != 0: self._add_entry(entry) + def _find_entry(self, useragent): + for entry in self.groups.values(): + if entry.applies_to(useragent): + return entry + return self.groups.get('*') + def can_fetch(self, useragent, url): """using the parsed robots.txt decide if useragent can fetch url""" if self.disallow_all: @@ -179,43 +173,33 @@ def can_fetch(self, useragent, url): # calls can_fetch() before calling read(). if not self.last_checked: return False - # search for given user agent matches - # the first match counts # TODO: The private API is used in order to preserve an empty query. 
# This is temporary until the public API starts supporting this feature. parsed_url = urllib.parse._urlsplit(url, '') url = urllib.parse._urlunsplit(None, None, *parsed_url[2:]) - url = normalize_path(url) + url = normalize_uri(url) if not url: url = "/" - for entry in self.entries: - if entry.applies_to(useragent): - return entry.allowance(url) - # try the default entry last - if self.default_entry: - return self.default_entry.allowance(url) - # agent not found ==> access granted - return True + entry = self._find_entry(useragent) + if entry is None: + return True + return entry.allowance(url) def crawl_delay(self, useragent): if not self.mtime(): return None - for entry in self.entries: - if entry.applies_to(useragent): - return entry.delay - if self.default_entry: - return self.default_entry.delay - return None + entry = self._find_entry(useragent) + if entry is None: + return None + return entry.delay def request_rate(self, useragent): if not self.mtime(): return None - for entry in self.entries: - if entry.applies_to(useragent): - return entry.req_rate - if self.default_entry: - return self.default_entry.req_rate - return None + entry = self._find_entry(useragent) + if entry is None: + return None + return entry.req_rate def site_maps(self): if not self.sitemaps: @@ -226,7 +210,7 @@ def __str__(self): entries = self.entries if self.default_entry is not None: entries = entries + [self.default_entry] - return '\n\n'.join(map(str, entries)) + return '\n\n'.join(filter(None, map(str, entries))) class RuleLine: """A rule line is a single "Allow:" (allowance==True) or "Disallow:" @@ -235,14 +219,40 @@ def __init__(self, path, allowance): if path == '' and not allowance: # an empty value means allow all allowance = True - self.path = normalize_path(path) + path = re.sub(r'[*]{2,}', '*', path) + path = re.sub(r'[$][$*]+', '$', path) + path = normalize_pattern(path) + self.fullmatch = path.endswith('$') + path = path.rstrip('$') + if '$' in path: + raise ValueError('$ not at the end of path') + self.matcher = None + if '*' in path: + pattern = re.compile(translite_pattern(path), re.DOTALL) + if self.fullmatch: + self.matcher = pattern.fullmatch + else: + self.matcher = pattern.match + self.path = path self.allowance = allowance def applies_to(self, filename): - return self.path == "*" or filename.startswith(self.path) + if self.matcher is not None: + m = self.matcher(filename) + if m: + return m.end() + 1 + else: + if self.fullmatch: + if filename == self.path: + return len(self.path) + 1 + else: + if filename.startswith(self.path): + return len(self.path) + 1 + return 0 def __str__(self): - return ("Allow" if self.allowance else "Disallow") + ": " + self.path + return (("Allow" if self.allowance else "Disallow") + ": " + self.path + + ('$' if self.fullmatch else '')) class Entry: @@ -254,6 +264,8 @@ def __init__(self): self.req_rate = None def __str__(self): + if not self.useragents: + return '' ret = [] for agent in self.useragents: ret.append(f"User-agent: {agent}") @@ -262,27 +274,86 @@ def __str__(self): if self.req_rate is not None: rate = self.req_rate ret.append(f"Request-rate: {rate.requests}/{rate.seconds}") - ret.extend(map(str, self.rulelines)) + if self.rulelines: + ret.extend(map(str, self.rulelines)) + else: + ret.append("Allow:") return '\n'.join(ret) def applies_to(self, useragent): """check if this entry applies to the specified agent""" + if useragent is None: + return '*' in self.useragents # split the name token and make it lower case useragent = 
useragent.split("/")[0].lower() for agent in self.useragents: - if agent == '*': - # we have the catch-all agent - return True - agent = agent.lower() - if agent in useragent: - return True + if agent != '*': + agent = agent.lower() + if agent in useragent: + return True return False def allowance(self, filename): """Preconditions: + - rules without wildcards are sorted from longest to shortest, + "Allow" before "Disallow" - our agent applies to this entry - filename is URL encoded""" + best_match = -1 + allowance = True for line in self.rulelines: - if line.applies_to(filename): - return line.allowance - return True + m = line.applies_to(filename) + if m: + if m > best_match: + best_match = m + allowance = line.allowance + elif m == best_match and not allowance: + allowance = line.allowance + # Optimization. + if line.matcher is None and (m or len(line.path) + 1 < best_match): + break + return allowance + + +def normalize(path): + unquoted = urllib.parse.unquote(path, errors='surrogateescape') + return urllib.parse.quote(unquoted, errors='surrogateescape') + +def normalize_uri(path): + path, sep, query = path.partition('?') + path = normalize(path) + if sep: + query = re.sub(r'[^=&]+', lambda m: normalize(m[0]), query) + path += '?' + query + return path + +def normalize_pattern(path): + path, sep, query = path.partition('?') + path = re.sub(r'[^*$]+', lambda m: normalize(m[0]), path) + if sep: + query = re.sub(r'[^=&*$]+', lambda m: normalize(m[0]), query) + path += '?' + query + return path + +def translite_pattern(path): + parts = list(map(re.escape, path.split('*'))) + for i in range(1, len(parts)-1): + parts[i] = f'(?>.*?{parts[i]})' + parts[-1] = f'.*{parts[-1]}' + return ''.join(parts) + +def merge_entries(e1, e2): + entry = Entry() + entry.useragents = list(filter(set(e2.useragents).__contains__, e1.useragents)) + entry.rulelines = e1.rulelines + e2.rulelines + entry.delay = e1.delay if e2.delay is None else e2.delay + entry.req_rate = e1.req_rate if e2.req_rate is None else e2.req_rate + return entry + +def sort_rulelines(rulelines): + def sortkey(line): + if line.matcher is not None: + return (True,) + else: + return (False, len(line.path), line.allowance) + rulelines.sort(key=sortkey, reverse=True) From d053f0d68f282dfd673fd7f8ace812faaace925d Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Thu, 18 Sep 2025 13:42:36 +0300 Subject: [PATCH 2/6] The /robots.txt URI is implicitly allowed. --- Lib/test/test_robotparser.py | 2 +- Lib/urllib/robotparser.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/Lib/test/test_robotparser.py b/Lib/test/test_robotparser.py index 6483d4d071fbc6..23091b6c33f8d3 100644 --- a/Lib/test/test_robotparser.py +++ b/Lib/test/test_robotparser.py @@ -158,7 +158,7 @@ class RejectAllRobotsTest(BaseRobotTest, unittest.TestCase): User-agent: * Disallow: / """ - good = [] + good = ['/robots.txt'] bad = ['/cyberworld/map/index.html', '/', '/tmp/'] diff --git a/Lib/urllib/robotparser.py b/Lib/urllib/robotparser.py index eb85574537bce6..9378616153279e 100644 --- a/Lib/urllib/robotparser.py +++ b/Lib/urllib/robotparser.py @@ -180,6 +180,9 @@ def can_fetch(self, useragent, url): url = normalize_uri(url) if not url: url = "/" + if url == '/robots.txt': + # The /robots.txt URI is implicitly allowed. 
+ return True entry = self._find_entry(useragent) if entry is None: return True From c944c1a1849a133694e5a3369ffe98c92d0df324 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Thu, 18 Sep 2025 18:02:55 +0300 Subject: [PATCH 3/6] Add more weird test cases. --- Lib/test/test_robotparser.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/Lib/test/test_robotparser.py b/Lib/test/test_robotparser.py index 23091b6c33f8d3..4ea6dd03a5ddf1 100644 --- a/Lib/test/test_robotparser.py +++ b/Lib/test/test_robotparser.py @@ -435,8 +435,11 @@ class WeirdPathTest(BaseRobotTest, unittest.TestCase): Disallow: /g$*$$** Disallow: /h$*$$**z """ - good = ['/b', '/bz', '/ax', '/d', '/f', '/fz', '/gx', '/h', '/hz'] - bad = ['/a', '/c', '/cxy', '/dz', '/dxyz', '/dxzy', '/e', '/exy', '/g'] + good = ['/ax', '/a$$', '/b', '/bz', '/b$z', '/d', '/f', '/fz', + '/f$$$z', '/fx$y$$z', '/gx', '/g$$$', '/g$x$$y', '/h', '/hz', + '/h$$$z', '/h$x$$yz'] + bad = ['/a', '/c', '/cxy', '/dz', '/dxyz', '/dxzy', '/e', '/exy', + '/e$$', '/ex$y$', '/g'] expected_output = """\ User-agent: * Disallow: /c* @@ -470,7 +473,8 @@ class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase): good = ['/some/path', '/some/path?', '/some/path%3Fname=value', '/some/path?name%3Dvalue', '/another/path', '/another/path%3F', - '/yet/one/path?name=value%26more'] + '/yet/one/path?name=value%26more', + '/some/pathxname=value'] bad = ['/some/path?name=value' '/another/path?', '/another/path?name=value', '/yet/one/path?name=value&more'] From e66d7743feab40aa3aa86a15e0ffda9db4d3cd7e Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sat, 25 Apr 2026 13:00:54 +0300 Subject: [PATCH 4/6] Fix typo. --- Lib/urllib/robotparser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/urllib/robotparser.py b/Lib/urllib/robotparser.py index 9378616153279e..d1c682ea231ada 100644 --- a/Lib/urllib/robotparser.py +++ b/Lib/urllib/robotparser.py @@ -231,7 +231,7 @@ def __init__(self, path, allowance): raise ValueError('$ not at the end of path') self.matcher = None if '*' in path: - pattern = re.compile(translite_pattern(path), re.DOTALL) + pattern = re.compile(translate_pattern(path), re.DOTALL) if self.fullmatch: self.matcher = pattern.fullmatch else: @@ -338,7 +338,7 @@ def normalize_pattern(path): path += '?' + query return path -def translite_pattern(path): +def translate_pattern(path): parts = list(map(re.escape, path.split('*'))) for i in range(1, len(parts)-1): parts[i] = f'(?>.*?{parts[i]})' From ca6c0d5818c43d298a1825a6bcd572d5b874b4de Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sat, 25 Apr 2026 14:11:37 +0300 Subject: [PATCH 5/6] Update some comments and docs. --- Doc/library/urllib.robotparser.rst | 2 +- Lib/urllib/robotparser.py | 14 +++++++++----- .../2026-04-25-14-11-24.gh-issue-138907.u21Wnh.rst | 1 + 3 files changed, 11 insertions(+), 6 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2026-04-25-14-11-24.gh-issue-138907.u21Wnh.rst diff --git a/Doc/library/urllib.robotparser.rst b/Doc/library/urllib.robotparser.rst index 492c65ae209d92..1fa7fc13baa539 100644 --- a/Doc/library/urllib.robotparser.rst +++ b/Doc/library/urllib.robotparser.rst @@ -18,7 +18,7 @@ This module provides a single class, :class:`RobotFileParser`, which answers questions about whether or not a particular user agent can fetch a URL on the website that published the :file:`robots.txt` file. For more details on the -structure of :file:`robots.txt` files, see http://www.robotstxt.org/orig.html. 
+structure of :file:`robots.txt` files, see :rfc:`9309`. .. class:: RobotFileParser(url='') diff --git a/Lib/urllib/robotparser.py b/Lib/urllib/robotparser.py index d1c682ea231ada..98c938cadea84a 100644 --- a/Lib/urllib/robotparser.py +++ b/Lib/urllib/robotparser.py @@ -7,7 +7,7 @@ 2) PSF license for Python 2.2 The robots.txt Exclusion Protocol is implemented as specified in - http://www.robotstxt.org/norobots-rfc.txt + RFC 9309 """ import collections @@ -240,6 +240,8 @@ def __init__(self, path, allowance): self.allowance = allowance def applies_to(self, filename): + # If the filename matches the rule, return the matching length plus 1. + # If it does not match, return 0. if self.matcher is not None: m = self.matcher(filename) if m: @@ -298,10 +300,12 @@ def applies_to(self, useragent): def allowance(self, filename): """Preconditions: - - rules without wildcards are sorted from longest to shortest, - "Allow" before "Disallow" - our agent applies to this entry - - filename is URL encoded""" + - filename is URL encoded + - rules are sorted: + - wildcards before literal paths + - literal paths from longest to shortest, "Allow" before "Disallow" + """ best_match = -1 allowance = True for line in self.rulelines: @@ -312,7 +316,7 @@ def allowance(self, filename): allowance = line.allowance elif m == best_match and not allowance: allowance = line.allowance - # Optimization. + # Optimization. Requires rules to be sorted. if line.matcher is None and (m or len(line.path) + 1 < best_match): break return allowance diff --git a/Misc/NEWS.d/next/Library/2026-04-25-14-11-24.gh-issue-138907.u21Wnh.rst b/Misc/NEWS.d/next/Library/2026-04-25-14-11-24.gh-issue-138907.u21Wnh.rst new file mode 100644 index 00000000000000..cc996a85f1c167 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2026-04-25-14-11-24.gh-issue-138907.u21Wnh.rst @@ -0,0 +1 @@ +Support :rfc:`9309` in :mod:`urllib.robotparser`. From 4febfe9253c22a9890dcf9df9ca955f352615bed Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sat, 25 Apr 2026 14:19:10 +0300 Subject: [PATCH 6/6] Prefer full match for user agent. --- Lib/test/test_robotparser.py | 11 +++++++---- Lib/urllib/robotparser.py | 4 ++++ 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/Lib/test/test_robotparser.py b/Lib/test/test_robotparser.py index 4ea6dd03a5ddf1..2ce916fedc2708 100644 --- a/Lib/test/test_robotparser.py +++ b/Lib/test/test_robotparser.py @@ -260,22 +260,25 @@ class AnotherInvalidRequestRateTest(BaseRobotTest, unittest.TestCase): class UserAgentOrderingTest(BaseRobotTest, unittest.TestCase): - # the order of User-agent should be correct. 
note - # that this file is incorrect because "Googlebot" is a - # substring of "Googlebot-Mobile" + # the order of User-agent should not matter robots_txt = """\ User-agent: Googlebot Disallow: / +Allow: /folder1/ User-agent: Googlebot-Mobile Allow: / +Disallow: /folder1/ """ agent = 'Googlebot' bad = ['/something.jpg'] + good = ['/folder1/myfile.html'] class UserAgentGoogleMobileTest(UserAgentOrderingTest): - agent = 'Googlebot-Mobile' + agent = 'Googlebot-mobile' + bad = ['/folder1/myfile.html'] + good = ['/something.jpg'] class LongestMatchTest(BaseRobotTest, unittest.TestCase): diff --git a/Lib/urllib/robotparser.py b/Lib/urllib/robotparser.py index 98c938cadea84a..13e016ff74c2ee 100644 --- a/Lib/urllib/robotparser.py +++ b/Lib/urllib/robotparser.py @@ -76,6 +76,7 @@ def read(self): def _add_entry(self, entry): self.entries.append(entry) for agent in entry.useragents: + agent = agent.lower() if agent not in self.groups: self.groups[agent] = entry else: @@ -156,6 +157,9 @@ def parse(self, lines): self._add_entry(entry) def _find_entry(self, useragent): + entry = self.groups.get(useragent.lower()) + if entry is not None: + return entry for entry in self.groups.values(): if entry.applies_to(useragent): return entry
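
Note for reviewers: a minimal usage sketch of the RFC 9309 matching semantics this series implements, using the robots.txt from SimpleExampleTest (the example in RFC 9309, section 5.1). The host name and the "somebot" agent are illustrative only; the expected results below correspond to the good/bad lists in that test.

import urllib.robotparser

robots_txt = """\
User-Agent: *
Disallow: *.gif$
Disallow: /example/
Allow: /publications/

User-Agent: foobot
Disallow: /
Allow: /example/page.html
Allow: /example/allowed.gif
"""

parser = urllib.robotparser.RobotFileParser()
parser.parse(robots_txt.splitlines())

# The longest matching rule decides, not the first rule in the group.
parser.can_fetch('foobot', 'http://example.com/example/page.html')       # True
parser.can_fetch('foobot', 'http://example.com/example/disallowed.gif')  # False
# An agent without a group of its own falls back to the '*' group.
parser.can_fetch('somebot', 'http://example.com/publications/')          # True
parser.can_fetch('somebot', 'http://example.com/example/')               # False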
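
One implementation detail worth calling out: PathWithManyWildcardsTest exists because a naive "*" -> ".*" translation makes matching take exponential time. The series instead wraps each inner wildcard segment in an atomic group "(?>...)" so the regex engine never backtracks into it (atomic groups require Python 3.11+). The sketch below restates translate_pattern() as added by the patch; the driver lines around it are illustrative, with N=50 mirroring the test.

import re

def translate_pattern(path):
    # As added by this series: each inner "*" segment becomes an atomic
    # group, so a failed match cannot be retried with a longer "*" span.
    parts = list(map(re.escape, path.split('*')))
    for i in range(1, len(parts) - 1):
        parts[i] = f'(?>.*?{parts[i]})'
    parts[-1] = f'.*{parts[-1]}'
    return ''.join(parts)

print(translate_pattern('/*a*a*b'))
# /(?>.*?a)(?>.*?a).*b  -- fails or matches in linear time, unlike /.*a.*a.*b
pattern = re.compile(translate_pattern('/' + '*a' * 50 + '*b'), re.DOTALL)
print(bool(pattern.match('/' + 'a' * 50 + 'b')))   # True (rule applies)
print(bool(pattern.match('/' + 'a' * 51)))         # False, and returns quickly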