From db992ca92ab2a2a03fbe62936ed40d9074727a57 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Fri, 12 Sep 2025 14:59:44 +0300 Subject: [PATCH 1/6] gh-138907: Support RFC 9309 in robotparser --- Lib/test/test_robotparser.py | 321 +++++++++++++++++++++++++++++++---- Lib/urllib/robotparser.py | 209 +++++++++++++++-------- 2 files changed, 428 insertions(+), 102 deletions(-) diff --git a/Lib/test/test_robotparser.py b/Lib/test/test_robotparser.py index e33723cc70c877..6483d4d071fbc6 100644 --- a/Lib/test/test_robotparser.py +++ b/Lib/test/test_robotparser.py @@ -15,14 +15,18 @@ class BaseRobotTest: good = [] bad = [] site_maps = None + expected_output = None def __init_subclass__(cls): super().__init_subclass__() # Remove tests that do nothing. - if not cls.good: - cls.test_good_urls = None - if not cls.bad: - cls.test_bad_urls = None + if issubclass(cls, unittest.TestCase): + if not cls.good: + cls.test_good_urls = None + if not cls.bad: + cls.test_bad_urls = None + if cls.expected_output is None: + cls.test_string_formatting = None def setUp(self): lines = io.StringIO(self.robots_txt).readlines() @@ -50,6 +54,8 @@ def test_bad_urls(self): def test_site_maps(self): self.assertEqual(self.parser.site_maps(), self.site_maps) + def test_string_formatting(self): + self.assertEqual(str(self.parser), self.expected_output) class UserAgentWildcardTest(BaseRobotTest, unittest.TestCase): robots_txt = """\ @@ -61,6 +67,56 @@ class UserAgentWildcardTest(BaseRobotTest, unittest.TestCase): good = ['/', '/test.html'] bad = ['/cyberworld/map/index.html', '/tmp/xxx', '/foo.html'] +class SimpleExampleTest(BaseRobotTest, unittest.TestCase): + # Example from RFC 9309, section 5.1. + robots_txt = """\ +User-Agent: * +Disallow: *.gif$ +Disallow: /example/ +Allow: /publications/ + +User-Agent: foobot +Disallow:/ +Allow:/example/page.html +Allow:/example/allowed.gif + +User-Agent: barbot +User-Agent: bazbot +Disallow: /example/page.html + +User-Agent: quxbot + """ + good = [ + '/', '/publications/', + ('foobot', '/example/page.html'), ('foobot', '/example/allowed.gif'), + ('barbot', '/'), ('barbot', '/example/'), + ('barbot', '/example/allowed.gif'), + ('barbot', '/example/disallowed.gif'), + ('barbot', '/publications/'), + ('barbot', '/publications/allowed.gif'), + ('bazbot', '/'), ('bazbot', '/example/'), + ('bazbot', '/example/allowed.gif'), + ('bazbot', '/example/disallowed.gif'), + ('bazbot', '/publications/'), + ('bazbot', '/publications/allowed.gif'), + ('quxbot', '/'), ('quxbot', '/example/'), + ('quxbot', '/example/page.html'), ('quxbot', '/example/allowed.gif'), + ('quxbot', '/example/disallowed.gif'), + ('quxbot', '/publications/'), + ('quxbot', '/publications/allowed.gif'), + ] + bad = [ + '/example/', '/example/page.html', '/example/allowed.gif', + '/example/disallowed.gif', + '/publications/allowed.gif', + ('foobot', '/'), ('foobot', '/example/'), + ('foobot', '/example/disallowed.gif'), + ('foobot', '/publications/'), + ('foobot', '/publications/allowed.gif'), + ('barbot', '/example/page.html'), + ('bazbot', '/example/page.html'), + ] + class CrawlDelayAndCustomAgentTest(BaseRobotTest, unittest.TestCase): robots_txt = """\ @@ -137,6 +193,7 @@ def test_request_rate(self): class EmptyFileTest(BaseRequestRateTest, unittest.TestCase): robots_txt = '' good = ['/foo'] + expected_output = '' class CrawlDelayAndRequestRateTest(BaseRequestRateTest, unittest.TestCase): @@ -221,17 +278,185 @@ class UserAgentGoogleMobileTest(UserAgentOrderingTest): agent = 'Googlebot-Mobile' -class 
GoogleURLOrderingTest(BaseRobotTest, unittest.TestCase): - # Google also got the order wrong. You need - # to specify the URLs from more specific to more general +class LongestMatchTest(BaseRobotTest, unittest.TestCase): + # Based on example from RFC 9309, section 5.2. robots_txt = """\ -User-agent: Googlebot -Allow: /folder1/myfile.html -Disallow: /folder1/ +User-agent: * +Allow: /example/page/ +Disallow: /example/page/disallowed.gif +Allow: /example/ """ - agent = 'googlebot' - good = ['/folder1/myfile.html'] - bad = ['/folder1/anotherfile.html'] + good = ['/example/', '/example/page/'] + bad = ['/example/page/disallowed.gif'] + + +class LongestMatchWildcardTest(BaseRobotTest, unittest.TestCase): + robots_txt = """\ +User-agent: * +Allow: /example/page/ +Disallow: *.gif +Allow: /example/ + """ + good = ['/example/', '/example/page/'] + bad = ['/example/page/disallowed.gif', '/x.gif'] + + +class AllowWinsEqualMatchTest(BaseRobotTest, unittest.TestCase): + robots_txt = """\ +User-agent: * +Disallow: /spam +Allow: /spam +Disallow: /spam + """ + good = ['/spam', '/spam/'] + + +class AllowWinsEqualFullMatchTest(BaseRobotTest, unittest.TestCase): + robots_txt = """\ +User-agent: * +Disallow: /spam +Allow: /spam$ +Disallow: /spam +Disallow: /eggs$ +Allow: /eggs +Disallow: /eggs$ + """ + good = ['/spam', '/eggs', '/eggs/'] + bad = ['/spam/'] + + +class AllowWinsEqualMatchWildcardTest(BaseRobotTest, unittest.TestCase): + robots_txt = """\ +User-agent: * +Disallow: /spam +Allow: *am +Disallow: /spam +Disallow: *gs +Allow: /eggs +Disallow: *gs + """ + good = ['/spam', '/eggs', '/spam/', '/eggs/'] + + +class MergeGroupsTest(BaseRobotTest, unittest.TestCase): + robots_txt = """\ +User-agent: spambot +Disallow: /some/path + +User-agent: spambot +Disallow: /another/path + """ + agent = 'spambot' + bad = ['/some/path', '/another/path'] + + +class UserAgentStartsGroupTest(BaseRobotTest, unittest.TestCase): + robots_txt = """\ +User-agent: spambot +Disallow: /some/path +User-agent: eggsbot +Disallow: /another/path + """ + good = [('spambot', '/'), ('spambot', '/another/path'), + ('eggsbot', '/'), ('eggsbot', '/some/path')] + bad = [('spambot', '/some/path'), ('eggsbot', '/another/path')] + expected_output = """\ +User-agent: spambot +Disallow: /some/path + +User-agent: eggsbot +Disallow: /another/path\ +""" + +class IgnoreEmptyLinesTest(BaseRobotTest, unittest.TestCase): + robots_txt = """\ +User-agent: spambot + +User-agent: eggsbot +Disallow: /some/path + +Disallow: /another/path + """ + good = [('spambot', '/'), ('eggsbot', '/')] + bad = [ + ('spambot', '/some/path'), ('spambot', '/another/path'), + ('eggsbot', '/some/path'), ('eggsbot', '/another/path'), + ] + expected_output = """\ +User-agent: spambot +User-agent: eggsbot +Disallow: /another/path +Disallow: /some/path\ +""" + + +class IgnoreRulesWithoutUserAgentTest(BaseRobotTest, unittest.TestCase): + robots_txt = """\ +Disallow: /some/path + +User-agent: * +Disallow: /another/path + """ + good = ['/', '/some/path'] + bad = ['/another/path'] + expected_output = """\ +User-agent: * +Disallow: /another/path\ +""" + + +class EmptyGroupTest(BaseRobotTest, unittest.TestCase): + robots_txt = """\ +User-agent: * +Disallow: /some/path + +User-agent: spambot + """ + agent = 'spambot' + good = ['/', '/some/path'] + expected_output = """\ +User-agent: * +Disallow: /some/path + +User-agent: spambot +Allow:\ +""" + + +class WeirdPathTest(BaseRobotTest, unittest.TestCase): + robots_txt = f"""\ +User-agent: * +Disallow: /a$$$ +Disallow: /b$z +Disallow: /c*** 
+Disallow: /d***z +Disallow: /e*$**$$ +Disallow: /f*$**$$z +Disallow: /g$*$$** +Disallow: /h$*$$**z + """ + good = ['/b', '/bz', '/ax', '/d', '/f', '/fz', '/gx', '/h', '/hz'] + bad = ['/a', '/c', '/cxy', '/dz', '/dxyz', '/dxzy', '/e', '/exy', '/g'] + expected_output = """\ +User-agent: * +Disallow: /c* +Disallow: /d*z +Disallow: /e*$ +Disallow: /a$ +Disallow: /g$\ +""" + + +class PathWithManyWildcardsTest(BaseRobotTest, unittest.TestCase): + # This test would take many years if use naive translation to regular + # expression (* -> .*). + N = 50 + robots_txt = f"""\ +User-agent: * +Disallow: /{'*a'*N}*b + """ + good = ['/' + 'a'*N + 'a'] + bad = ['/' + 'a'*N + 'b'] class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase): @@ -251,19 +476,6 @@ class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase): '/yet/one/path?name=value&more'] -class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase): - # obey first * entry (#4108) - robots_txt = """\ -User-agent: * -Disallow: /some/path - -User-agent: * -Disallow: /another/path - """ - good = ['/another/path'] - bad = ['/some/path'] - - class PercentEncodingTest(BaseRobotTest, unittest.TestCase): robots_txt = """\ User-agent: * @@ -365,17 +577,60 @@ class StringFormattingTest(BaseRobotTest, unittest.TestCase): """ expected_output = """\ -User-agent: cybermapper -Disallow: /some/path - User-agent: * Crawl-delay: 1 Request-rate: 3/15 -Disallow: /cyberworld/map/\ +Disallow: /cyberworld/map/ + +User-agent: cybermapper +Disallow: /some/path\ """ - def test_string_formatting(self): - self.assertEqual(str(self.parser), self.expected_output) + +class ConstructedStringFormattingTest(unittest.TestCase): + def test_empty(self): + parser = urllib.robotparser.RobotFileParser() + self.assertEqual(str(parser), '') + + def test_group_without_rules(self): + parser = urllib.robotparser.RobotFileParser() + entry = urllib.robotparser.Entry() + entry.useragents = ['spambot'] + parser._add_entry(entry) + entry = urllib.robotparser.Entry() + entry.useragents = ['hambot'] + entry.rulelines = [urllib.robotparser.RuleLine('/ham', False)] + parser._add_entry(entry) + entry = urllib.robotparser.Entry() + entry.useragents = ['eggsbot'] + parser._add_entry(entry) + self.assertEqual(str(parser), """\ +User-agent: spambot +Allow: + +User-agent: hambot +Disallow: /ham + +User-agent: eggsbot +Allow:\ +""") + + def test_group_without_user_agent(self): + parser = urllib.robotparser.RobotFileParser() + entry = urllib.robotparser.Entry() + entry.rulelines = [urllib.robotparser.RuleLine('/ham', False)] + parser._add_entry(entry) + entry = urllib.robotparser.Entry() + entry.useragents = ['spambot'] + entry.rulelines = [urllib.robotparser.RuleLine('/spam', False)] + parser._add_entry(entry) + entry = urllib.robotparser.Entry() + entry.rulelines = [urllib.robotparser.RuleLine('/eggs', False)] + parser._add_entry(entry) + self.assertEqual(str(parser), """\ +User-agent: spambot +Disallow: /spam\ +""") @unittest.skipUnless( @@ -495,7 +750,7 @@ def test_basic(self): def test_can_fetch(self): self.assertTrue(self.parser.can_fetch('*', self.url('elsewhere'))) self.assertFalse(self.parser.can_fetch('Nutch', self.base_url)) - self.assertFalse(self.parser.can_fetch('Nutch', self.url('brian'))) + self.assertTrue(self.parser.can_fetch('Nutch', self.url('brian'))) self.assertFalse(self.parser.can_fetch('Nutch', self.url('webstats'))) self.assertFalse(self.parser.can_fetch('*', self.url('webstats'))) self.assertTrue(self.parser.can_fetch('*', self.base_url)) diff --git 
a/Lib/urllib/robotparser.py b/Lib/urllib/robotparser.py index 4009fd6b58f594..eb85574537bce6 100644 --- a/Lib/urllib/robotparser.py +++ b/Lib/urllib/robotparser.py @@ -21,19 +21,6 @@ RequestRate = collections.namedtuple("RequestRate", "requests seconds") -def normalize(path): - unquoted = urllib.parse.unquote(path, errors='surrogateescape') - return urllib.parse.quote(unquoted, errors='surrogateescape') - -def normalize_path(path): - path, sep, query = path.partition('?') - path = normalize(path) - if sep: - query = re.sub(r'[^=&]+', lambda m: normalize(m[0]), query) - path += '?' + query - return path - - class RobotFileParser: """ This class provides a set of methods to read, parse and answer questions about a single robots.txt file. @@ -42,6 +29,7 @@ class RobotFileParser: def __init__(self, url=''): self.entries = [] + self.groups = {} self.sitemaps = [] self.default_entry = None self.disallow_all = False @@ -86,13 +74,13 @@ def read(self): self.parse(raw.decode("utf-8", "surrogateescape").splitlines()) def _add_entry(self, entry): - if "*" in entry.useragents: - # the default entry is considered last - if self.default_entry is None: - # the first default entry wins - self.default_entry = entry - else: - self.entries.append(entry) + self.entries.append(entry) + for agent in entry.useragents: + if agent not in self.groups: + self.groups[agent] = entry + else: + self.groups[agent] = merge_entries(self.groups[agent], entry) + sort_rulelines(self.groups[agent].rulelines) def parse(self, lines): """Parse the input lines from a robots.txt file. @@ -100,6 +88,7 @@ def parse(self, lines): We allow that a user-agent: line is not preceded by one or more blank lines. """ + entries = [] # states: # 0: start state # 1: saw user-agent line @@ -109,14 +98,6 @@ def parse(self, lines): self.modified() for line in lines: - if not line: - if state == 1: - entry = Entry() - state = 0 - elif state == 2: - self._add_entry(entry) - entry = Entry() - state = 0 # remove optional comment and strip line i = line.find('#') if i >= 0: @@ -132,16 +113,23 @@ def parse(self, lines): if state == 2: self._add_entry(entry) entry = Entry() - entry.useragents.append(line[1]) + product_token = line[1] + entry.useragents.append(product_token) state = 1 elif line[0] == "disallow": if state != 0: - entry.rulelines.append(RuleLine(line[1], False)) state = 2 + try: + entry.rulelines.append(RuleLine(line[1], False)) + except ValueError: + pass elif line[0] == "allow": if state != 0: - entry.rulelines.append(RuleLine(line[1], True)) state = 2 + try: + entry.rulelines.append(RuleLine(line[1], True)) + except ValueError: + pass elif line[0] == "crawl-delay": if state != 0: # before trying to convert to int we need to make @@ -164,9 +152,15 @@ def parse(self, lines): # so it doesn't matter where you place it in your file." # Therefore we do not change the state of the parser. self.sitemaps.append(line[1]) - if state == 2: + if state != 0: self._add_entry(entry) + def _find_entry(self, useragent): + for entry in self.groups.values(): + if entry.applies_to(useragent): + return entry + return self.groups.get('*') + def can_fetch(self, useragent, url): """using the parsed robots.txt decide if useragent can fetch url""" if self.disallow_all: @@ -179,43 +173,33 @@ def can_fetch(self, useragent, url): # calls can_fetch() before calling read(). if not self.last_checked: return False - # search for given user agent matches - # the first match counts # TODO: The private API is used in order to preserve an empty query. 
# This is temporary until the public API starts supporting this feature. parsed_url = urllib.parse._urlsplit(url, '') url = urllib.parse._urlunsplit(None, None, *parsed_url[2:]) - url = normalize_path(url) + url = normalize_uri(url) if not url: url = "/" - for entry in self.entries: - if entry.applies_to(useragent): - return entry.allowance(url) - # try the default entry last - if self.default_entry: - return self.default_entry.allowance(url) - # agent not found ==> access granted - return True + entry = self._find_entry(useragent) + if entry is None: + return True + return entry.allowance(url) def crawl_delay(self, useragent): if not self.mtime(): return None - for entry in self.entries: - if entry.applies_to(useragent): - return entry.delay - if self.default_entry: - return self.default_entry.delay - return None + entry = self._find_entry(useragent) + if entry is None: + return None + return entry.delay def request_rate(self, useragent): if not self.mtime(): return None - for entry in self.entries: - if entry.applies_to(useragent): - return entry.req_rate - if self.default_entry: - return self.default_entry.req_rate - return None + entry = self._find_entry(useragent) + if entry is None: + return None + return entry.req_rate def site_maps(self): if not self.sitemaps: @@ -226,7 +210,7 @@ def __str__(self): entries = self.entries if self.default_entry is not None: entries = entries + [self.default_entry] - return '\n\n'.join(map(str, entries)) + return '\n\n'.join(filter(None, map(str, entries))) class RuleLine: """A rule line is a single "Allow:" (allowance==True) or "Disallow:" @@ -235,14 +219,40 @@ def __init__(self, path, allowance): if path == '' and not allowance: # an empty value means allow all allowance = True - self.path = normalize_path(path) + path = re.sub(r'[*]{2,}', '*', path) + path = re.sub(r'[$][$*]+', '$', path) + path = normalize_pattern(path) + self.fullmatch = path.endswith('$') + path = path.rstrip('$') + if '$' in path: + raise ValueError('$ not at the end of path') + self.matcher = None + if '*' in path: + pattern = re.compile(translite_pattern(path), re.DOTALL) + if self.fullmatch: + self.matcher = pattern.fullmatch + else: + self.matcher = pattern.match + self.path = path self.allowance = allowance def applies_to(self, filename): - return self.path == "*" or filename.startswith(self.path) + if self.matcher is not None: + m = self.matcher(filename) + if m: + return m.end() + 1 + else: + if self.fullmatch: + if filename == self.path: + return len(self.path) + 1 + else: + if filename.startswith(self.path): + return len(self.path) + 1 + return 0 def __str__(self): - return ("Allow" if self.allowance else "Disallow") + ": " + self.path + return (("Allow" if self.allowance else "Disallow") + ": " + self.path + + ('$' if self.fullmatch else '')) class Entry: @@ -254,6 +264,8 @@ def __init__(self): self.req_rate = None def __str__(self): + if not self.useragents: + return '' ret = [] for agent in self.useragents: ret.append(f"User-agent: {agent}") @@ -262,27 +274,86 @@ def __str__(self): if self.req_rate is not None: rate = self.req_rate ret.append(f"Request-rate: {rate.requests}/{rate.seconds}") - ret.extend(map(str, self.rulelines)) + if self.rulelines: + ret.extend(map(str, self.rulelines)) + else: + ret.append("Allow:") return '\n'.join(ret) def applies_to(self, useragent): """check if this entry applies to the specified agent""" + if useragent is None: + return '*' in self.useragents # split the name token and make it lower case useragent = 
useragent.split("/")[0].lower() for agent in self.useragents: - if agent == '*': - # we have the catch-all agent - return True - agent = agent.lower() - if agent in useragent: - return True + if agent != '*': + agent = agent.lower() + if agent in useragent: + return True return False def allowance(self, filename): """Preconditions: + - rules without wildcards are sorted from longest to shortest, + "Allow" before "Disallow" - our agent applies to this entry - filename is URL encoded""" + best_match = -1 + allowance = True for line in self.rulelines: - if line.applies_to(filename): - return line.allowance - return True + m = line.applies_to(filename) + if m: + if m > best_match: + best_match = m + allowance = line.allowance + elif m == best_match and not allowance: + allowance = line.allowance + # Optimization. + if line.matcher is None and (m or len(line.path) + 1 < best_match): + break + return allowance + + +def normalize(path): + unquoted = urllib.parse.unquote(path, errors='surrogateescape') + return urllib.parse.quote(unquoted, errors='surrogateescape') + +def normalize_uri(path): + path, sep, query = path.partition('?') + path = normalize(path) + if sep: + query = re.sub(r'[^=&]+', lambda m: normalize(m[0]), query) + path += '?' + query + return path + +def normalize_pattern(path): + path, sep, query = path.partition('?') + path = re.sub(r'[^*$]+', lambda m: normalize(m[0]), path) + if sep: + query = re.sub(r'[^=&*$]+', lambda m: normalize(m[0]), query) + path += '?' + query + return path + +def translite_pattern(path): + parts = list(map(re.escape, path.split('*'))) + for i in range(1, len(parts)-1): + parts[i] = f'(?>.*?{parts[i]})' + parts[-1] = f'.*{parts[-1]}' + return ''.join(parts) + +def merge_entries(e1, e2): + entry = Entry() + entry.useragents = list(filter(set(e2.useragents).__contains__, e1.useragents)) + entry.rulelines = e1.rulelines + e2.rulelines + entry.delay = e1.delay if e2.delay is None else e2.delay + entry.req_rate = e1.req_rate if e2.req_rate is None else e2.req_rate + return entry + +def sort_rulelines(rulelines): + def sortkey(line): + if line.matcher is not None: + return (True,) + else: + return (False, len(line.path), line.allowance) + rulelines.sort(key=sortkey, reverse=True) From d053f0d68f282dfd673fd7f8ace812faaace925d Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Thu, 18 Sep 2025 13:42:36 +0300 Subject: [PATCH 2/6] The /robots.txt URI is implicitly allowed. --- Lib/test/test_robotparser.py | 2 +- Lib/urllib/robotparser.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/Lib/test/test_robotparser.py b/Lib/test/test_robotparser.py index 6483d4d071fbc6..23091b6c33f8d3 100644 --- a/Lib/test/test_robotparser.py +++ b/Lib/test/test_robotparser.py @@ -158,7 +158,7 @@ class RejectAllRobotsTest(BaseRobotTest, unittest.TestCase): User-agent: * Disallow: / """ - good = [] + good = ['/robots.txt'] bad = ['/cyberworld/map/index.html', '/', '/tmp/'] diff --git a/Lib/urllib/robotparser.py b/Lib/urllib/robotparser.py index eb85574537bce6..9378616153279e 100644 --- a/Lib/urllib/robotparser.py +++ b/Lib/urllib/robotparser.py @@ -180,6 +180,9 @@ def can_fetch(self, useragent, url): url = normalize_uri(url) if not url: url = "/" + if url == '/robots.txt': + # The /robots.txt URI is implicitly allowed. 
+ return True entry = self._find_entry(useragent) if entry is None: return True From c944c1a1849a133694e5a3369ffe98c92d0df324 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Thu, 18 Sep 2025 18:02:55 +0300 Subject: [PATCH 3/6] Add more weird test cases. --- Lib/test/test_robotparser.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/Lib/test/test_robotparser.py b/Lib/test/test_robotparser.py index 23091b6c33f8d3..4ea6dd03a5ddf1 100644 --- a/Lib/test/test_robotparser.py +++ b/Lib/test/test_robotparser.py @@ -435,8 +435,11 @@ class WeirdPathTest(BaseRobotTest, unittest.TestCase): Disallow: /g$*$$** Disallow: /h$*$$**z """ - good = ['/b', '/bz', '/ax', '/d', '/f', '/fz', '/gx', '/h', '/hz'] - bad = ['/a', '/c', '/cxy', '/dz', '/dxyz', '/dxzy', '/e', '/exy', '/g'] + good = ['/ax', '/a$$', '/b', '/bz', '/b$z', '/d', '/f', '/fz', + '/f$$$z', '/fx$y$$z', '/gx', '/g$$$', '/g$x$$y', '/h', '/hz', + '/h$$$z', '/h$x$$yz'] + bad = ['/a', '/c', '/cxy', '/dz', '/dxyz', '/dxzy', '/e', '/exy', + '/e$$', '/ex$y$', '/g'] expected_output = """\ User-agent: * Disallow: /c* @@ -470,7 +473,8 @@ class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase): good = ['/some/path', '/some/path?', '/some/path%3Fname=value', '/some/path?name%3Dvalue', '/another/path', '/another/path%3F', - '/yet/one/path?name=value%26more'] + '/yet/one/path?name=value%26more', + '/some/pathxname=value'] bad = ['/some/path?name=value' '/another/path?', '/another/path?name=value', '/yet/one/path?name=value&more'] From e66d7743feab40aa3aa86a15e0ffda9db4d3cd7e Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sat, 25 Apr 2026 13:00:54 +0300 Subject: [PATCH 4/6] Fix typo. --- Lib/urllib/robotparser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/urllib/robotparser.py b/Lib/urllib/robotparser.py index 9378616153279e..d1c682ea231ada 100644 --- a/Lib/urllib/robotparser.py +++ b/Lib/urllib/robotparser.py @@ -231,7 +231,7 @@ def __init__(self, path, allowance): raise ValueError('$ not at the end of path') self.matcher = None if '*' in path: - pattern = re.compile(translite_pattern(path), re.DOTALL) + pattern = re.compile(translate_pattern(path), re.DOTALL) if self.fullmatch: self.matcher = pattern.fullmatch else: @@ -338,7 +338,7 @@ def normalize_pattern(path): path += '?' + query return path -def translite_pattern(path): +def translate_pattern(path): parts = list(map(re.escape, path.split('*'))) for i in range(1, len(parts)-1): parts[i] = f'(?>.*?{parts[i]})' From ca6c0d5818c43d298a1825a6bcd572d5b874b4de Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sat, 25 Apr 2026 14:11:37 +0300 Subject: [PATCH 5/6] Update some comments and docs. --- Doc/library/urllib.robotparser.rst | 2 +- Lib/urllib/robotparser.py | 14 +++++++++----- .../2026-04-25-14-11-24.gh-issue-138907.u21Wnh.rst | 1 + 3 files changed, 11 insertions(+), 6 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2026-04-25-14-11-24.gh-issue-138907.u21Wnh.rst diff --git a/Doc/library/urllib.robotparser.rst b/Doc/library/urllib.robotparser.rst index 492c65ae209d92..1fa7fc13baa539 100644 --- a/Doc/library/urllib.robotparser.rst +++ b/Doc/library/urllib.robotparser.rst @@ -18,7 +18,7 @@ This module provides a single class, :class:`RobotFileParser`, which answers questions about whether or not a particular user agent can fetch a URL on the website that published the :file:`robots.txt` file. For more details on the -structure of :file:`robots.txt` files, see http://www.robotstxt.org/orig.html. 
+structure of :file:`robots.txt` files, see :rfc:`9309`. .. class:: RobotFileParser(url='') diff --git a/Lib/urllib/robotparser.py b/Lib/urllib/robotparser.py index d1c682ea231ada..98c938cadea84a 100644 --- a/Lib/urllib/robotparser.py +++ b/Lib/urllib/robotparser.py @@ -7,7 +7,7 @@ 2) PSF license for Python 2.2 The robots.txt Exclusion Protocol is implemented as specified in - http://www.robotstxt.org/norobots-rfc.txt + RFC 9309 """ import collections @@ -240,6 +240,8 @@ def __init__(self, path, allowance): self.allowance = allowance def applies_to(self, filename): + # If the filename matches the rule, return the matching length plus 1. + # If it does not match, return 0. if self.matcher is not None: m = self.matcher(filename) if m: @@ -298,10 +300,12 @@ def applies_to(self, useragent): def allowance(self, filename): """Preconditions: - - rules without wildcards are sorted from longest to shortest, - "Allow" before "Disallow" - our agent applies to this entry - - filename is URL encoded""" + - filename is URL encoded + - rules are sorted: + - wildcards before literal paths + - literal paths from longest to shortest, "Allow" before "Disallow" + """ best_match = -1 allowance = True for line in self.rulelines: @@ -312,7 +316,7 @@ def allowance(self, filename): allowance = line.allowance elif m == best_match and not allowance: allowance = line.allowance - # Optimization. + # Optimization. Requires rules to be sorted. if line.matcher is None and (m or len(line.path) + 1 < best_match): break return allowance diff --git a/Misc/NEWS.d/next/Library/2026-04-25-14-11-24.gh-issue-138907.u21Wnh.rst b/Misc/NEWS.d/next/Library/2026-04-25-14-11-24.gh-issue-138907.u21Wnh.rst new file mode 100644 index 00000000000000..cc996a85f1c167 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2026-04-25-14-11-24.gh-issue-138907.u21Wnh.rst @@ -0,0 +1 @@ +Support :rfc:`9309` in :mod:`urllib.robotparser`. From 4febfe9253c22a9890dcf9df9ca955f352615bed Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sat, 25 Apr 2026 14:19:10 +0300 Subject: [PATCH 6/6] Prefer full match for user agent. --- Lib/test/test_robotparser.py | 11 +++++++---- Lib/urllib/robotparser.py | 4 ++++ 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/Lib/test/test_robotparser.py b/Lib/test/test_robotparser.py index 4ea6dd03a5ddf1..2ce916fedc2708 100644 --- a/Lib/test/test_robotparser.py +++ b/Lib/test/test_robotparser.py @@ -260,22 +260,25 @@ class AnotherInvalidRequestRateTest(BaseRobotTest, unittest.TestCase): class UserAgentOrderingTest(BaseRobotTest, unittest.TestCase): - # the order of User-agent should be correct. 
note - # that this file is incorrect because "Googlebot" is a - # substring of "Googlebot-Mobile" + # the order of User-agent should not matter robots_txt = """\ User-agent: Googlebot Disallow: / +Allow: /folder1/ User-agent: Googlebot-Mobile Allow: / +Disallow: /folder1/ """ agent = 'Googlebot' bad = ['/something.jpg'] + good = ['/folder1/myfile.html'] class UserAgentGoogleMobileTest(UserAgentOrderingTest): - agent = 'Googlebot-Mobile' + agent = 'Googlebot-mobile' + bad = ['/folder1/myfile.html'] + good = ['/something.jpg'] class LongestMatchTest(BaseRobotTest, unittest.TestCase): diff --git a/Lib/urllib/robotparser.py b/Lib/urllib/robotparser.py index 98c938cadea84a..13e016ff74c2ee 100644 --- a/Lib/urllib/robotparser.py +++ b/Lib/urllib/robotparser.py @@ -76,6 +76,7 @@ def read(self): def _add_entry(self, entry): self.entries.append(entry) for agent in entry.useragents: + agent = agent.lower() if agent not in self.groups: self.groups[agent] = entry else: @@ -156,6 +157,9 @@ def parse(self, lines): self._add_entry(entry) def _find_entry(self, useragent): + entry = self.groups.get(useragent.lower()) + if entry is not None: + return entry for entry in self.groups.values(): if entry.applies_to(useragent): return entry
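
Note for reviewers: a minimal usage sketch of the RFC 9309 matching semantics this series implements, using the robots.txt from SimpleExampleTest (the example in RFC 9309, section 5.1). The host name and the "somebot" agent are illustrative only; the expected results below correspond to the good/bad lists in that test.

import urllib.robotparser

robots_txt = """\
User-Agent: *
Disallow: *.gif$
Disallow: /example/
Allow: /publications/

User-Agent: foobot
Disallow: /
Allow: /example/page.html
Allow: /example/allowed.gif
"""

parser = urllib.robotparser.RobotFileParser()
parser.parse(robots_txt.splitlines())

# The longest matching rule decides, not the first rule in the group.
parser.can_fetch('foobot', 'http://example.com/example/page.html')       # True
parser.can_fetch('foobot', 'http://example.com/example/disallowed.gif')  # False
# An agent without a group of its own falls back to the '*' group.
parser.can_fetch('somebot', 'http://example.com/publications/')          # True
parser.can_fetch('somebot', 'http://example.com/example/')               # False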
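
One implementation detail worth calling out: PathWithManyWildcardsTest exists because a naive "*" -> ".*" translation makes matching take exponential time. The series instead wraps each inner wildcard segment in an atomic group "(?>...)" so the regex engine never backtracks into it (atomic groups require Python 3.11+). The sketch below restates translate_pattern() as added by the patch; the driver lines around it are illustrative, with N=50 mirroring the test.

import re

def translate_pattern(path):
    # As added by this series: each inner "*" segment becomes an atomic
    # group, so a failed match cannot be retried with a longer "*" span.
    parts = list(map(re.escape, path.split('*')))
    for i in range(1, len(parts) - 1):
        parts[i] = f'(?>.*?{parts[i]})'
    parts[-1] = f'.*{parts[-1]}'
    return ''.join(parts)

print(translate_pattern('/*a*a*b'))
# /(?>.*?a)(?>.*?a).*b  -- fails or matches in linear time, unlike /.*a.*a.*b
pattern = re.compile(translate_pattern('/' + '*a' * 50 + '*b'), re.DOTALL)
print(bool(pattern.match('/' + 'a' * 50 + 'b')))   # True (rule applies)
print(bool(pattern.match('/' + 'a' * 51)))         # False, and returns quickly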