From 5051e06cb32200f0b68c2731d4b4d3c58b09d5b7 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Fri, 20 Mar 2026 15:52:35 -0400 Subject: [PATCH 01/11] factor mark_unresolved into a function so it can be reused Signed-off-by: Adrian Edwards --- .../tasks/github/facade_github/tasks.py | 31 +++++++++++-------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/collectoss/tasks/github/facade_github/tasks.py b/collectoss/tasks/github/facade_github/tasks.py index ab7a18eab..60038d9c7 100644 --- a/collectoss/tasks/github/facade_github/tasks.py +++ b/collectoss/tasks/github/facade_github/tasks.py @@ -72,19 +72,8 @@ def process_commit_metadata(logger, auth, contributorQueue, repo_id, platform_id if login == None or login == "": logger.error("Failed to get login from supplemental data!") - - unresolved = { - "email": email, - "name": name, - } - logger.debug(f"No more username resolution methods available. Inserting data into unresolved table: {unresolved}") - - try: - unresolved_natural_keys = ['email'] - bulk_insert_dicts(logger, unresolved, UnresolvedCommitEmail, unresolved_natural_keys) - except Exception as e: - logger.error( - f"Could not create new unresolved email {email}. Error: {e}") + mark_unresolved(name, email, logger) + # move on to the next contributor continue @@ -157,6 +146,22 @@ def process_commit_metadata(logger, auth, contributorQueue, repo_id, platform_id return +def mark_unresolved(name, email, logger): + unresolved = { + "email": email, + "name": name, + } + logger.debug(f"No more username resolution methods available. Inserting data into unresolved table: {unresolved}") + + try: + unresolved_natural_keys = ['email'] + bulk_insert_dicts(logger, unresolved, UnresolvedCommitEmail, unresolved_natural_keys) + except Exception as e: + logger.error( + f"Could not create new unresolved email {email}. Error: {e}") + + + def link_commits_to_contributor(logger, facade_helper, contributorQueue): # # iterate through all the commits with emails that appear in contributors and give them the relevant cntrb_id. From e3582a8f74f86fd98ab04c0dc9fa948d16422217 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Fri, 20 Mar 2026 17:27:09 -0400 Subject: [PATCH 02/11] create a helper function for sanity checking email addresses Signed-off-by: Adrian Edwards --- collectoss/tasks/github/util/util.py | 57 ++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/collectoss/tasks/github/util/util.py b/collectoss/tasks/github/util/util.py index a0f009855..487a2cfc8 100644 --- a/collectoss/tasks/github/util/util.py +++ b/collectoss/tasks/github/util/util.py @@ -9,6 +9,63 @@ from collectoss.application.db.lib import get_repo_by_repo_git from collectoss.tasks.util.worker_util import calculate_date_weight_from_timestamps + +def sanity_check_email(possible_email:str) -> str: + """Accept a string that might contain an email and attempt to validate it + This was built for performing some basic sanity checking before attempting to use + the information in an email address for processing. + + It is built to be simple without new libraries for now + since the intent is more for basic sanity checks to aid in parsing a known format. + It is not (yet) fully RFC-compliant. + + If it becomes heavily used, we should defer to a well-made library + + Args: + possible_email (str): a string that might contain an email + + Returns: + str: a string representing what should (mostly) be a valid email address. None is returned if no valid email could be found. + """ + + try: + if possible_email is None: + return None + + candidate_email = str(possible_email) + + + at_pos = candidate_email.find("@") + if at_pos == -1: + # email cannot possibly be valid without an @ sign + return None + + preceeding_space = candidate_email.rfind(" ", 0) + following_space = candidate_email.find(" ", at_pos) + search_result = (preceeding_space != -1, following_space != -1) + + if search_result == (True, True): + # spaces were on either side + candidate_email = candidate_email[preceeding_space, following_space + 1] + elif search_result == (False, True): + # space after + candidate_email = candidate_email[:following_space + 1] + elif search_result == (True, False): + # space after + candidate_email = candidate_email[preceeding_space:] + # otherwise, there were no spaces, so the string stays the same. + + if not candidate_email.isascii(): + # non-ascii is pretty uncommon, especially for narrow usecases like + # checking for an existing domain like the github noreply domain. + # if this is insufficient, a library may be a better option. + + return None + + return candidate_email + except Exception: + return None + def get_repo_src_id(owner, repo, logger): From 00d13ebe4537f9da254d9e9cdbdded2d71dfdb86 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Fri, 20 Mar 2026 17:28:06 -0400 Subject: [PATCH 03/11] attempt to shortcut contributor resolution when github noreply users are found Signed-off-by: Adrian Edwards --- .../tasks/github/facade_github/tasks.py | 43 ++++++++++++++++++- 1 file changed, 41 insertions(+), 2 deletions(-) diff --git a/collectoss/tasks/github/facade_github/tasks.py b/collectoss/tasks/github/facade_github/tasks.py index 60038d9c7..9e1ca47e2 100644 --- a/collectoss/tasks/github/facade_github/tasks.py +++ b/collectoss/tasks/github/facade_github/tasks.py @@ -11,7 +11,7 @@ from collectoss.tasks.git.util.facade_worker.facade_worker.facade00mainprogram import * from collectoss.application.db.lib import bulk_insert_dicts from collectoss.application.db.data_parse import extract_needed_contributor_data as extract_github_contributor - +from collectoss.tasks.github.util.util import sanity_check_email @@ -59,7 +59,31 @@ def process_commit_metadata(logger, auth, contributorQueue, repo_id, platform_id # logger.debug("Failed local login lookup") # else: # login = contributors_with_matching_name[0].gh_login - + + + # Here we attempt to detect and parse the user id from the email address directly + # this is helpful in case the contributor uses a github noreply address for commits. + + # to do this we should first do some sanity checks on the email + email_check = sanity_check_email(email) + if email_check is not None and 'users.noreply.github.com' in email_check: + # we have a noreply address + + # the email check explicitly looks for presence of an @ sign + email_user = email_check.split("@")[0] + if '+' in email_user: + username_parts = email_user.split("+") + user_id, login = username_parts[0], username_parts[-1] + else: + user_id = email_user + + if not user_id.isnumeric(): + logger.warning( + f"Something went wrong parsing user id '{user_id}` from github noreply user {login} <{email_check}>. " + f"Falling back to regular lookup process" + ) + login = None + user_id = None # Try to get the login from the commit sha if login == None or login == "": @@ -85,6 +109,21 @@ def process_commit_metadata(logger, auth, contributorQueue, repo_id, platform_id logger.warning(f"User of {login} not found on github. Skipping...") continue + # if we had a noreply address and there wasnt an issue parsing a numeric user ID, + # return now that we have the user record to validate that the profile we fetched for the username + # does indeed match the correct user + if email_check is not None and 'users.noreply.github.com' in email_check and user_id is not None: + gh_user_src_id = user_data.get("id") + + if gh_user_src_id != int(user_id): + logger.warning( + f"github noreply user src id {gh_user_src_id} doesn't match the ID from their github noreply email: {email_check}." + "Marking as unresolved and skipping to avoid inserting mismatched data." + ) + mark_unresolved(name, email, logger) + continue + + # Use the email found in the commit data if api data is NULL emailFromCommitData = contributor['email_raw'] if 'email_raw' in contributor else contributor['email'] From 61218367e40427ee8cee7003e1ca6b1d8e622d50 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Tue, 26 May 2026 14:54:39 -0400 Subject: [PATCH 04/11] fixes to email extraction Signed-off-by: Adrian Edwards --- collectoss/tasks/github/util/util.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/collectoss/tasks/github/util/util.py b/collectoss/tasks/github/util/util.py index 487a2cfc8..fa78e294c 100644 --- a/collectoss/tasks/github/util/util.py +++ b/collectoss/tasks/github/util/util.py @@ -40,17 +40,17 @@ def sanity_check_email(possible_email:str) -> str: # email cannot possibly be valid without an @ sign return None - preceeding_space = candidate_email.rfind(" ", 0) + preceeding_space = candidate_email.rfind(" ", 0, at_pos) following_space = candidate_email.find(" ", at_pos) search_result = (preceeding_space != -1, following_space != -1) if search_result == (True, True): # spaces were on either side - candidate_email = candidate_email[preceeding_space, following_space + 1] - elif search_result == (False, True): + candidate_email = candidate_email[preceeding_space+1:following_space] + elif search_result == (False, True): # space after candidate_email = candidate_email[:following_space + 1] - elif search_result == (True, False): + elif search_result == (True, False): # space after candidate_email = candidate_email[preceeding_space:] # otherwise, there were no spaces, so the string stays the same. From df4ae4ee15e914d267367c27a40d1f27c38f8def Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Tue, 26 May 2026 14:55:44 -0400 Subject: [PATCH 05/11] calm type warnings Signed-off-by: Adrian Edwards --- collectoss/tasks/github/util/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/collectoss/tasks/github/util/util.py b/collectoss/tasks/github/util/util.py index fa78e294c..f6e89f6ad 100644 --- a/collectoss/tasks/github/util/util.py +++ b/collectoss/tasks/github/util/util.py @@ -10,7 +10,7 @@ from collectoss.tasks.util.worker_util import calculate_date_weight_from_timestamps -def sanity_check_email(possible_email:str) -> str: +def sanity_check_email(possible_email:str) -> str | None: """Accept a string that might contain an email and attempt to validate it This was built for performing some basic sanity checking before attempting to use the information in an email address for processing. From d7a7fcecfb85ddde7c9475d93c6cd4059627d192 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Tue, 26 May 2026 14:56:43 -0400 Subject: [PATCH 06/11] replace email extraction with a strip() call for handling spaces either side We are assuming we are working with a plain email Signed-off-by: Adrian Edwards --- collectoss/tasks/github/util/util.py | 23 +---------------------- 1 file changed, 1 insertion(+), 22 deletions(-) diff --git a/collectoss/tasks/github/util/util.py b/collectoss/tasks/github/util/util.py index f6e89f6ad..98590fd9e 100644 --- a/collectoss/tasks/github/util/util.py +++ b/collectoss/tasks/github/util/util.py @@ -32,28 +32,7 @@ def sanity_check_email(possible_email:str) -> str | None: if possible_email is None: return None - candidate_email = str(possible_email) - - - at_pos = candidate_email.find("@") - if at_pos == -1: - # email cannot possibly be valid without an @ sign - return None - - preceeding_space = candidate_email.rfind(" ", 0, at_pos) - following_space = candidate_email.find(" ", at_pos) - search_result = (preceeding_space != -1, following_space != -1) - - if search_result == (True, True): - # spaces were on either side - candidate_email = candidate_email[preceeding_space+1:following_space] - elif search_result == (False, True): - # space after - candidate_email = candidate_email[:following_space + 1] - elif search_result == (True, False): - # space after - candidate_email = candidate_email[preceeding_space:] - # otherwise, there were no spaces, so the string stays the same. + candidate_email = str(possible_email).strip() if not candidate_email.isascii(): # non-ascii is pretty uncommon, especially for narrow usecases like From d446c7b6ad08b4420acc5f9f19c6af9df163c036 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Tue, 26 May 2026 14:58:23 -0400 Subject: [PATCH 07/11] factor out email extraction to a new function This helps handle the edge case where there are other words besides an email, separated by a space, in the string. Signed-off-by: Adrian Edwards --- collectoss/tasks/github/util/util.py | 44 ++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/collectoss/tasks/github/util/util.py b/collectoss/tasks/github/util/util.py index 98590fd9e..cec87ef10 100644 --- a/collectoss/tasks/github/util/util.py +++ b/collectoss/tasks/github/util/util.py @@ -9,6 +9,47 @@ from collectoss.application.db.lib import get_repo_by_repo_git from collectoss.tasks.util.worker_util import calculate_date_weight_from_timestamps +def extract_email(possible_email:str) -> str | None: + """Accept a string that might contain an email and extract the email from it for later validation. + An email is searched for by looking for the first @ sign, and then looking for a space (or the end of string) on either side + + If it becomes heavily used, we should defer to a well-made library + + Args: + possible_email (str): a string that might contain an email + + Returns: + str: a string representing a potential email address extracted from the string. None is returned if no valid email could be found. + """ + try: + + if possible_email is None: + return None + + candidate_email = str(possible_email).strip() + + at_pos = candidate_email.find("@") + if at_pos == -1: + # email cannot possibly be valid without an @ sign + return None + + preceeding_space = candidate_email.rfind(" ", 0, at_pos) + following_space = candidate_email.find(" ", at_pos) + search_result = (preceeding_space != -1, following_space != -1) + + if search_result == (True, True): + # spaces were on either side + candidate_email = candidate_email[preceeding_space+1:following_space] + elif search_result == (False, True): + # space after + candidate_email = candidate_email[:following_space + 1] + elif search_result == (True, False): + # space after + candidate_email = candidate_email[preceeding_space:] + # otherwise, there were no spaces, so the string stays the same. + return candidate_email + except Exception: + return None def sanity_check_email(possible_email:str) -> str | None: """Accept a string that might contain an email and attempt to validate it @@ -34,6 +75,9 @@ def sanity_check_email(possible_email:str) -> str | None: candidate_email = str(possible_email).strip() + if candidate_email.count(" ") > 0: + candidate_email = extract_email(candidate_email) + if not candidate_email.isascii(): # non-ascii is pretty uncommon, especially for narrow usecases like # checking for an existing domain like the github noreply domain. From 78d0f462183be961c6f317240411f8e2bc24d8cb Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Tue, 26 May 2026 14:58:44 -0400 Subject: [PATCH 08/11] explicitly fail to validate emails with multiple @ signs or missing text either side of the @ Signed-off-by: Adrian Edwards --- collectoss/tasks/github/util/util.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/collectoss/tasks/github/util/util.py b/collectoss/tasks/github/util/util.py index cec87ef10..f99e30b72 100644 --- a/collectoss/tasks/github/util/util.py +++ b/collectoss/tasks/github/util/util.py @@ -85,6 +85,15 @@ def sanity_check_email(possible_email:str) -> str | None: return None + email_parts = candidate_email.split("@") + if len(email_parts) > 2: + # Emails cannot have more than one @ + return None + + if '' in email_parts: + # Email cant possibly be valid if there is nothing both before and after the @ + return None + return candidate_email except Exception: return None From a78202b20f0d0ba9b2c14c2979f9b464ca94c689 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Tue, 26 May 2026 15:01:01 -0400 Subject: [PATCH 09/11] explicitly reject non string types Signed-off-by: Adrian Edwards --- collectoss/tasks/github/util/util.py | 3 + .../test_util/test_email_validation.py | 79 +++++++++++++++++++ 2 files changed, 82 insertions(+) create mode 100644 tests/test_tasks/test_task_utilities/test_util/test_email_validation.py diff --git a/collectoss/tasks/github/util/util.py b/collectoss/tasks/github/util/util.py index f99e30b72..81c17651f 100644 --- a/collectoss/tasks/github/util/util.py +++ b/collectoss/tasks/github/util/util.py @@ -72,6 +72,9 @@ def sanity_check_email(possible_email:str) -> str | None: try: if possible_email is None: return None + + if not isinstance(possible_email, str): + raise TypeError("Email address values must be strings") candidate_email = str(possible_email).strip() diff --git a/tests/test_tasks/test_task_utilities/test_util/test_email_validation.py b/tests/test_tasks/test_task_utilities/test_util/test_email_validation.py new file mode 100644 index 000000000..007fa25dc --- /dev/null +++ b/tests/test_tasks/test_task_utilities/test_util/test_email_validation.py @@ -0,0 +1,79 @@ +import pytest + +from collectoss.tasks.github.util.util import sanity_check_email, extract_email + +class TestSanityCheckEmail: + + @pytest.mark.unit + def test_simple_valid_emails(self): + assert sanity_check_email("user@example.com") == "user@example.com" + + #This is the primary real-world use case per the caller in facade_github. + result = sanity_check_email("12345+username@users.noreply.github.com") + assert result == "12345+username@users.noreply.github.com" + + @pytest.mark.unit + def test_simple_valid_emails_with_padding(self): + assert sanity_check_email(" user@example.com ") == "user@example.com" + + #This is the primary real-world use case per the caller in facade_github. + result = sanity_check_email(" 12345+username@users.noreply.github.com ") + assert result == "12345+username@users.noreply.github.com" + + + @pytest.mark.unit + def test_email_with_plus_addressing(self): + assert sanity_check_email("user+tag@example.com") == "user+tag@example.com" + + @pytest.mark.unit + def test_email_with_dots_in_local(self): + assert sanity_check_email("first.last@example.com") == "first.last@example.com" + + @pytest.mark.unit + def test_email_with_subdomain(self): + assert sanity_check_email("user@mail.example.co.uk") == "user@mail.example.co.uk" + + @pytest.mark.unit + def test_non_ascii(self): + assert sanity_check_email("üser@example.com") is None + assert sanity_check_email("user@exämple.com") is None + assert sanity_check_email("user😀@example.com") is None + +class TestSanityCheckEmailEdgeCasesInput: + + @pytest.mark.unit + def test_integer_input_returns_none(self): + with pytest.raises(TypeError): + assert sanity_check_email(42) is None + + @pytest.mark.unit + def test_boolean_input_returns_none(self): + with pytest.raises(TypeError): + assert sanity_check_email(True) is None + + +class TestSanityCheckEmailEdgeCases: + + @pytest.mark.unit + def test_at_sign_only(self): + assert sanity_check_email("@") is None + + @pytest.mark.unit + def test_at_sign_with_domain_no_local(self): + assert sanity_check_email("@example.com") is None + + @pytest.mark.unit + def test_local_with_at_no_domain(self): + assert sanity_check_email("user@") is None + + @pytest.mark.unit + def test_multiple_at_signs(self): + assert sanity_check_email("user@@example.com") is None + + +class TestEmailExtraction: + + @pytest.mark.unit + def test_embedded_email_in_text_returns_email(self): + """This is intended to test cases where the user put their name and email in the same field, either before or after""" + assert extract_email("Name user@example.com and more") == 'user@example.com' \ No newline at end of file From 036193b82efed76c9ae7477df9dabc5d792a0fae Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Tue, 26 May 2026 15:04:41 -0400 Subject: [PATCH 10/11] handle type mismatch Signed-off-by: Adrian Edwards --- collectoss/tasks/github/util/util.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/collectoss/tasks/github/util/util.py b/collectoss/tasks/github/util/util.py index 81c17651f..dde4556f0 100644 --- a/collectoss/tasks/github/util/util.py +++ b/collectoss/tasks/github/util/util.py @@ -79,7 +79,12 @@ def sanity_check_email(possible_email:str) -> str | None: candidate_email = str(possible_email).strip() if candidate_email.count(" ") > 0: - candidate_email = extract_email(candidate_email) + extracted_email = extract_email(candidate_email) + if not extracted_email: + # could not extract + return None + else: + candidate_email = extracted_email if not candidate_email.isascii(): # non-ascii is pretty uncommon, especially for narrow usecases like From 5713b7a5a355472605537ccd1188c59c25908052 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Tue, 26 May 2026 15:05:14 -0400 Subject: [PATCH 11/11] move raise of TypeError outside Try block so it alerts the caller. Signed-off-by: Adrian Edwards --- collectoss/tasks/github/util/util.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/collectoss/tasks/github/util/util.py b/collectoss/tasks/github/util/util.py index dde4556f0..225d32efd 100644 --- a/collectoss/tasks/github/util/util.py +++ b/collectoss/tasks/github/util/util.py @@ -68,13 +68,14 @@ def sanity_check_email(possible_email:str) -> str | None: Returns: str: a string representing what should (mostly) be a valid email address. None is returned if no valid email could be found. """ - - try: - if possible_email is None: + + if possible_email is None: return None - if not isinstance(possible_email, str): - raise TypeError("Email address values must be strings") + if not isinstance(possible_email, str): + raise TypeError("Email address values must be strings") + + try: candidate_email = str(possible_email).strip()