diff --git a/collectoss/tasks/github/facade_github/tasks.py b/collectoss/tasks/github/facade_github/tasks.py index ab7a18eab..9e1ca47e2 100644 --- a/collectoss/tasks/github/facade_github/tasks.py +++ b/collectoss/tasks/github/facade_github/tasks.py @@ -11,7 +11,7 @@ from collectoss.tasks.git.util.facade_worker.facade_worker.facade00mainprogram import * from collectoss.application.db.lib import bulk_insert_dicts from collectoss.application.db.data_parse import extract_needed_contributor_data as extract_github_contributor - +from collectoss.tasks.github.util.util import sanity_check_email @@ -59,7 +59,31 @@ def process_commit_metadata(logger, auth, contributorQueue, repo_id, platform_id # logger.debug("Failed local login lookup") # else: # login = contributors_with_matching_name[0].gh_login - + + + # Here we attempt to detect and parse the user id from the email address directly + # this is helpful in case the contributor uses a github noreply address for commits. + + # to do this we should first do some sanity checks on the email + email_check = sanity_check_email(email) + if email_check is not None and 'users.noreply.github.com' in email_check: + # we have a noreply address + + # the email check explicitly looks for presence of an @ sign + email_user = email_check.split("@")[0] + if '+' in email_user: + username_parts = email_user.split("+") + user_id, login = username_parts[0], username_parts[-1] + else: + user_id = email_user + + if not user_id.isnumeric(): + logger.warning( + f"Something went wrong parsing user id '{user_id}` from github noreply user {login} <{email_check}>. " + f"Falling back to regular lookup process" + ) + login = None + user_id = None # Try to get the login from the commit sha if login == None or login == "": @@ -72,19 +96,8 @@ def process_commit_metadata(logger, auth, contributorQueue, repo_id, platform_id if login == None or login == "": logger.error("Failed to get login from supplemental data!") - - unresolved = { - "email": email, - "name": name, - } - logger.debug(f"No more username resolution methods available. Inserting data into unresolved table: {unresolved}") - - try: - unresolved_natural_keys = ['email'] - bulk_insert_dicts(logger, unresolved, UnresolvedCommitEmail, unresolved_natural_keys) - except Exception as e: - logger.error( - f"Could not create new unresolved email {email}. Error: {e}") + mark_unresolved(name, email, logger) + # move on to the next contributor continue @@ -96,6 +109,21 @@ def process_commit_metadata(logger, auth, contributorQueue, repo_id, platform_id logger.warning(f"User of {login} not found on github. Skipping...") continue + # if we had a noreply address and there wasnt an issue parsing a numeric user ID, + # return now that we have the user record to validate that the profile we fetched for the username + # does indeed match the correct user + if email_check is not None and 'users.noreply.github.com' in email_check and user_id is not None: + gh_user_src_id = user_data.get("id") + + if gh_user_src_id != int(user_id): + logger.warning( + f"github noreply user src id {gh_user_src_id} doesn't match the ID from their github noreply email: {email_check}." + "Marking as unresolved and skipping to avoid inserting mismatched data." + ) + mark_unresolved(name, email, logger) + continue + + # Use the email found in the commit data if api data is NULL emailFromCommitData = contributor['email_raw'] if 'email_raw' in contributor else contributor['email'] @@ -157,6 +185,22 @@ def process_commit_metadata(logger, auth, contributorQueue, repo_id, platform_id return +def mark_unresolved(name, email, logger): + unresolved = { + "email": email, + "name": name, + } + logger.debug(f"No more username resolution methods available. Inserting data into unresolved table: {unresolved}") + + try: + unresolved_natural_keys = ['email'] + bulk_insert_dicts(logger, unresolved, UnresolvedCommitEmail, unresolved_natural_keys) + except Exception as e: + logger.error( + f"Could not create new unresolved email {email}. Error: {e}") + + + def link_commits_to_contributor(logger, facade_helper, contributorQueue): # # iterate through all the commits with emails that appear in contributors and give them the relevant cntrb_id. diff --git a/collectoss/tasks/github/util/util.py b/collectoss/tasks/github/util/util.py index a0f009855..225d32efd 100644 --- a/collectoss/tasks/github/util/util.py +++ b/collectoss/tasks/github/util/util.py @@ -9,6 +9,104 @@ from collectoss.application.db.lib import get_repo_by_repo_git from collectoss.tasks.util.worker_util import calculate_date_weight_from_timestamps +def extract_email(possible_email:str) -> str | None: + """Accept a string that might contain an email and extract the email from it for later validation. + An email is searched for by looking for the first @ sign, and then looking for a space (or the end of string) on either side + + If it becomes heavily used, we should defer to a well-made library + + Args: + possible_email (str): a string that might contain an email + + Returns: + str: a string representing a potential email address extracted from the string. None is returned if no valid email could be found. + """ + try: + + if possible_email is None: + return None + + candidate_email = str(possible_email).strip() + + at_pos = candidate_email.find("@") + if at_pos == -1: + # email cannot possibly be valid without an @ sign + return None + + preceeding_space = candidate_email.rfind(" ", 0, at_pos) + following_space = candidate_email.find(" ", at_pos) + search_result = (preceeding_space != -1, following_space != -1) + + if search_result == (True, True): + # spaces were on either side + candidate_email = candidate_email[preceeding_space+1:following_space] + elif search_result == (False, True): + # space after + candidate_email = candidate_email[:following_space + 1] + elif search_result == (True, False): + # space after + candidate_email = candidate_email[preceeding_space:] + # otherwise, there were no spaces, so the string stays the same. + return candidate_email + except Exception: + return None + +def sanity_check_email(possible_email:str) -> str | None: + """Accept a string that might contain an email and attempt to validate it + This was built for performing some basic sanity checking before attempting to use + the information in an email address for processing. + + It is built to be simple without new libraries for now + since the intent is more for basic sanity checks to aid in parsing a known format. + It is not (yet) fully RFC-compliant. + + If it becomes heavily used, we should defer to a well-made library + + Args: + possible_email (str): a string that might contain an email + + Returns: + str: a string representing what should (mostly) be a valid email address. None is returned if no valid email could be found. + """ + + if possible_email is None: + return None + + if not isinstance(possible_email, str): + raise TypeError("Email address values must be strings") + + try: + + candidate_email = str(possible_email).strip() + + if candidate_email.count(" ") > 0: + extracted_email = extract_email(candidate_email) + if not extracted_email: + # could not extract + return None + else: + candidate_email = extracted_email + + if not candidate_email.isascii(): + # non-ascii is pretty uncommon, especially for narrow usecases like + # checking for an existing domain like the github noreply domain. + # if this is insufficient, a library may be a better option. + + return None + + email_parts = candidate_email.split("@") + if len(email_parts) > 2: + # Emails cannot have more than one @ + return None + + if '' in email_parts: + # Email cant possibly be valid if there is nothing both before and after the @ + return None + + return candidate_email + except Exception: + return None + def get_repo_src_id(owner, repo, logger): diff --git a/tests/test_tasks/test_task_utilities/test_util/test_email_validation.py b/tests/test_tasks/test_task_utilities/test_util/test_email_validation.py new file mode 100644 index 000000000..007fa25dc --- /dev/null +++ b/tests/test_tasks/test_task_utilities/test_util/test_email_validation.py @@ -0,0 +1,79 @@ +import pytest + +from collectoss.tasks.github.util.util import sanity_check_email, extract_email + +class TestSanityCheckEmail: + + @pytest.mark.unit + def test_simple_valid_emails(self): + assert sanity_check_email("user@example.com") == "user@example.com" + + #This is the primary real-world use case per the caller in facade_github. + result = sanity_check_email("12345+username@users.noreply.github.com") + assert result == "12345+username@users.noreply.github.com" + + @pytest.mark.unit + def test_simple_valid_emails_with_padding(self): + assert sanity_check_email(" user@example.com ") == "user@example.com" + + #This is the primary real-world use case per the caller in facade_github. + result = sanity_check_email(" 12345+username@users.noreply.github.com ") + assert result == "12345+username@users.noreply.github.com" + + + @pytest.mark.unit + def test_email_with_plus_addressing(self): + assert sanity_check_email("user+tag@example.com") == "user+tag@example.com" + + @pytest.mark.unit + def test_email_with_dots_in_local(self): + assert sanity_check_email("first.last@example.com") == "first.last@example.com" + + @pytest.mark.unit + def test_email_with_subdomain(self): + assert sanity_check_email("user@mail.example.co.uk") == "user@mail.example.co.uk" + + @pytest.mark.unit + def test_non_ascii(self): + assert sanity_check_email("üser@example.com") is None + assert sanity_check_email("user@exämple.com") is None + assert sanity_check_email("user😀@example.com") is None + +class TestSanityCheckEmailEdgeCasesInput: + + @pytest.mark.unit + def test_integer_input_returns_none(self): + with pytest.raises(TypeError): + assert sanity_check_email(42) is None + + @pytest.mark.unit + def test_boolean_input_returns_none(self): + with pytest.raises(TypeError): + assert sanity_check_email(True) is None + + +class TestSanityCheckEmailEdgeCases: + + @pytest.mark.unit + def test_at_sign_only(self): + assert sanity_check_email("@") is None + + @pytest.mark.unit + def test_at_sign_with_domain_no_local(self): + assert sanity_check_email("@example.com") is None + + @pytest.mark.unit + def test_local_with_at_no_domain(self): + assert sanity_check_email("user@") is None + + @pytest.mark.unit + def test_multiple_at_signs(self): + assert sanity_check_email("user@@example.com") is None + + +class TestEmailExtraction: + + @pytest.mark.unit + def test_embedded_email_in_text_returns_email(self): + """This is intended to test cases where the user put their name and email in the same field, either before or after""" + assert extract_email("Name user@example.com and more") == 'user@example.com' \ No newline at end of file