Skip to content

Commit

Permalink
Refactor URL verification logic
Browse files Browse the repository at this point in the history
Move verification to its own function, and make the verification
checks more exhaustive by parsing the URL and adding more test
cases.
  • Loading branch information
facutuesca committed Jul 2, 2024
1 parent a654664 commit 5065bbd
Show file tree
Hide file tree
Showing 2 changed files with 125 additions and 3 deletions.
85 changes: 84 additions & 1 deletion tests/unit/forklift/test_legacy.py
Original file line number Diff line number Diff line change
Expand Up @@ -3296,7 3296,7 @@ def test_upload_succeeds_creates_release(
@pytest.mark.parametrize(
"url, expected",
[
("https://xpto.com", False), # Totally different
("https://google.com", False), # Totally different
("https://github.com/foo", False), # Missing parts
("https://github.com/foo/bar/", True), # Exactly the same
("https://github.com/foo/bar/readme.md", True), # Additonal parts
Expand Down Expand Up @@ -4168,3 4168,86 @@ def test_missing_trailing_slash_redirect(pyramid_request):
"/legacy/ (with a trailing slash)"
)
assert resp.headers["Location"] == "/legacy/"


@pytest.mark.parametrize(
("url", "publisher_url", "expected"),
[
( # GitHub trivial case
"https://github.com/owner/project",
"https://github.com/owner/project",
True,
),
( # ActiveState trivial case
"https://platform.activestate.com/owner/project",
"https://platform.activestate.com/owner/project",
True,
),
( # GitLab trivial case
"https://gitlab.com/owner/project",
"https://gitlab.com/owner/project",
True,
),
( # URL is a sub-path of the TP URL
"https://github.com/owner/project/issues",
"https://github.com/owner/project",
True,
),
( # Normalization
"https://GiThUB.com/owner/project/",
"https://github.com/owner/project",
True,
),
( # TP URL is a prefix, but not a parent of the URL
"https://github.com/owner/project22",
"https://github.com/owner/project",
False,
),
( # URL is a parent of the TP URL
"https://github.com/owner",
"https://github.com/owner/project",
False,
),
( # Scheme component does not match
"http://github.com/owner/project",
"https://github.com/owner/project",
False,
),
( # Host component does not match
"https://gitlab.com/owner/project",
"https://github.com/owner/project",
False,
),
( # Host component matches, but contains user and port info
"https://[email protected]:443/owner/project",
"https://github.com/owner/project",
False,
),
( # URL path component is empty
"https://github.com",
"https://github.com/owner/project",
False,
),
( # TP URL path component is empty
# (currently no TPs have an empty path, so even if the given URL is a
# sub-path of the TP URL, we fail the verification)
"https://github.com/owner/project",
"https://github.com",
False,
),
( # Both path components are empty
# (currently no TPs have an empty path, so even if the given URL is the
# same as the TP URL, we fail the verification)
"https://github.com",
"https://github.com",
False,
),
( # Publisher URL is None
"https://github.com/owner/project",
None,
False,
),
],
)
def test_verify_url(url, publisher_url, expected):
assert legacy._verify_url(url, publisher_url) == expected
43 changes: 41 additions & 2 deletions warehouse/forklift/legacy.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 25,7 @@
import packaging.utils
import packaging.version
import packaging_legacy.version
import rfc3986
import sentry_sdk
import wtforms
import wtforms.validators
Expand Down Expand Up @@ -367,6 368,45 @@ def _is_duplicate_file(db_session, filename, hashes):
return None


def _verify_url(url: str, publisher_url: str | None) -> bool:
"""
Verify a given URL against a Trusted Publisher URL
A URL is considered "verified" iff it matches the Trusted Publisher URL
such that, when both URLs are normalized:
- The scheme component is the same (e.g: both use `https`)
- The authority component is the same (e.g.: `github.com`)
- The path component is the same, or a sub-path of the Trusted Publisher URL
(e.g.: `org/project` and `org/project/issues.html` will pass verification
against an `org/project` Trusted Publisher path component)
- The path component of the Trusted Publisher URL is not empty
Note: We compare the authority component instead of the host component because
the authority includes the host, and in practice neither URL should have user
nor port information.
"""
if not publisher_url:
return False

publisher_uri = rfc3986.api.uri_reference(publisher_url).normalize()
user_uri = rfc3986.api.uri_reference(url).normalize()
if publisher_uri.path is None:
# Currently no Trusted Publishers have an empty path component,
# so we defensively fail verification.
return False
elif user_uri.path and publisher_uri.path:
is_subpath = publisher_uri.path == user_uri.path or user_uri.path.startswith(
publisher_uri.path "/"
)
else:
is_subpath = publisher_uri.path == user_uri.path

return (
publisher_uri.scheme == user_uri.scheme
and publisher_uri.authority == user_uri.authority
and is_subpath
)


@view_config(
route_name="forklift.legacy.file_upload",
uses_session=True,
Expand Down Expand Up @@ -686,8 726,7 @@ def file_upload(request):
else {
name: {
"url": url,
"verified": publisher_base_url
and url.lower().startswith(publisher_base_url.lower()),
"verified": _verify_url(url=url, publisher_url=publisher_base_url),
}
for name, url in meta.project_urls.items()
}
Expand Down

0 comments on commit 5065bbd

Please sign in to comment.