Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 59 additions & 15 deletions collectoss/tasks/github/facade_github/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from collectoss.tasks.git.util.facade_worker.facade_worker.facade00mainprogram import *
from collectoss.application.db.lib import bulk_insert_dicts
from collectoss.application.db.data_parse import extract_needed_contributor_data as extract_github_contributor

from collectoss.tasks.github.util.util import sanity_check_email



Expand Down Expand Up @@ -59,7 +59,31 @@ def process_commit_metadata(logger, auth, contributorQueue, repo_id, platform_id
# logger.debug("Failed local login lookup")
# else:
# login = contributors_with_matching_name[0].gh_login



# Here we attempt to detect and parse the user id from the email address directly
# this is helpful in case the contributor uses a github noreply address for commits.

# to do this we should first do some sanity checks on the email
email_check = sanity_check_email(email)
if email_check is not None and 'users.noreply.github.com' in email_check:
# we have a noreply address

# the email check explicitly looks for presence of an @ sign
email_user = email_check.split("@")[0]
if '+' in email_user:
username_parts = email_user.split("+")
user_id, login = username_parts[0], username_parts[-1]
else:
user_id = email_user

if not user_id.isnumeric():
logger.warning(
f"Something went wrong parsing user id '{user_id}` from github noreply user {login} <{email_check}>. "
f"Falling back to regular lookup process"
)
login = None
user_id = None

# Try to get the login from the commit sha
if login == None or login == "":
Expand All @@ -72,19 +96,8 @@ def process_commit_metadata(logger, auth, contributorQueue, repo_id, platform_id

if login == None or login == "":
logger.error("Failed to get login from supplemental data!")

unresolved = {
"email": email,
"name": name,
}
logger.debug(f"No more username resolution methods available. Inserting data into unresolved table: {unresolved}")

try:
unresolved_natural_keys = ['email']
bulk_insert_dicts(logger, unresolved, UnresolvedCommitEmail, unresolved_natural_keys)
except Exception as e:
logger.error(
f"Could not create new unresolved email {email}. Error: {e}")
mark_unresolved(name, email, logger)

# move on to the next contributor
continue

Expand All @@ -96,6 +109,21 @@ def process_commit_metadata(logger, auth, contributorQueue, repo_id, platform_id
logger.warning(f"User of {login} not found on github. Skipping...")
continue

# if we had a noreply address and there wasnt an issue parsing a numeric user ID,
# return now that we have the user record to validate that the profile we fetched for the username
# does indeed match the correct user
if email_check is not None and 'users.noreply.github.com' in email_check and user_id is not None:
gh_user_src_id = user_data.get("id")

if gh_user_src_id != int(user_id):
logger.warning(
f"github noreply user src id {gh_user_src_id} doesn't match the ID from their github noreply email: {email_check}."
"Marking as unresolved and skipping to avoid inserting mismatched data."
)
mark_unresolved(name, email, logger)
continue


# Use the email found in the commit data if api data is NULL
emailFromCommitData = contributor['email_raw'] if 'email_raw' in contributor else contributor['email']

Expand Down Expand Up @@ -157,6 +185,22 @@ def process_commit_metadata(logger, auth, contributorQueue, repo_id, platform_id
return


def mark_unresolved(name, email, logger):
unresolved = {
"email": email,
"name": name,
}
logger.debug(f"No more username resolution methods available. Inserting data into unresolved table: {unresolved}")

try:
unresolved_natural_keys = ['email']
bulk_insert_dicts(logger, unresolved, UnresolvedCommitEmail, unresolved_natural_keys)
except Exception as e:
logger.error(
f"Could not create new unresolved email {email}. Error: {e}")



def link_commits_to_contributor(logger, facade_helper, contributorQueue):

# # iterate through all the commits with emails that appear in contributors and give them the relevant cntrb_id.
Expand Down
98 changes: 98 additions & 0 deletions collectoss/tasks/github/util/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,104 @@
from collectoss.application.db.lib import get_repo_by_repo_git
from collectoss.tasks.util.worker_util import calculate_date_weight_from_timestamps

def extract_email(possible_email:str) -> str | None:
"""Accept a string that might contain an email and extract the email from it for later validation.
An email is searched for by looking for the first @ sign, and then looking for a space (or the end of string) on either side

If it becomes heavily used, we should defer to a well-made library

Args:
possible_email (str): a string that might contain an email

Returns:
str: a string representing a potential email address extracted from the string. None is returned if no valid email could be found.
"""
try:

if possible_email is None:
return None

candidate_email = str(possible_email).strip()

at_pos = candidate_email.find("@")
if at_pos == -1:
# email cannot possibly be valid without an @ sign
return None

preceeding_space = candidate_email.rfind(" ", 0, at_pos)

Check failure on line 36 in collectoss/tasks/github/util/util.py

View workflow job for this annotation

GitHub Actions / misspell

[misspell] collectoss/tasks/github/util/util.py#L36

"preceeding" is a misspelling of "preceding"
Raw output
./collectoss/tasks/github/util/util.py:36:8: "preceeding" is a misspelling of "preceding"
following_space = candidate_email.find(" ", at_pos)
search_result = (preceeding_space != -1, following_space != -1)

Check failure on line 38 in collectoss/tasks/github/util/util.py

View workflow job for this annotation

GitHub Actions / misspell

[misspell] collectoss/tasks/github/util/util.py#L38

"preceeding" is a misspelling of "preceding"
Raw output
./collectoss/tasks/github/util/util.py:38:25: "preceeding" is a misspelling of "preceding"

if search_result == (True, True):
# spaces were on either side
candidate_email = candidate_email[preceeding_space+1:following_space]

Check failure on line 42 in collectoss/tasks/github/util/util.py

View workflow job for this annotation

GitHub Actions / misspell

[misspell] collectoss/tasks/github/util/util.py#L42

"preceeding" is a misspelling of "preceding"
Raw output
./collectoss/tasks/github/util/util.py:42:46: "preceeding" is a misspelling of "preceding"
elif search_result == (False, True):
# space after
candidate_email = candidate_email[:following_space + 1]
elif search_result == (True, False):
# space after
candidate_email = candidate_email[preceeding_space:]

Check failure on line 48 in collectoss/tasks/github/util/util.py

View workflow job for this annotation

GitHub Actions / misspell

[misspell] collectoss/tasks/github/util/util.py#L48

"preceeding" is a misspelling of "preceding"
Raw output
./collectoss/tasks/github/util/util.py:48:46: "preceeding" is a misspelling of "preceding"
# otherwise, there were no spaces, so the string stays the same.
return candidate_email
except Exception:
return None

def sanity_check_email(possible_email:str) -> str | None:
"""Accept a string that might contain an email and attempt to validate it
This was built for performing some basic sanity checking before attempting to use
the information in an email address for processing.

It is built to be simple without new libraries for now
since the intent is more for basic sanity checks to aid in parsing a known format.
It is not (yet) fully RFC-compliant.

If it becomes heavily used, we should defer to a well-made library

Args:
possible_email (str): a string that might contain an email

Returns:
str: a string representing what should (mostly) be a valid email address. None is returned if no valid email could be found.
"""

if possible_email is None:
return None

if not isinstance(possible_email, str):
raise TypeError("Email address values must be strings")

try:

candidate_email = str(possible_email).strip()

if candidate_email.count(" ") > 0:
extracted_email = extract_email(candidate_email)
if not extracted_email:
# could not extract
return None
else:
candidate_email = extracted_email

if not candidate_email.isascii():
# non-ascii is pretty uncommon, especially for narrow usecases like
# checking for an existing domain like the github noreply domain.
# if this is insufficient, a library may be a better option.

return None

email_parts = candidate_email.split("@")
if len(email_parts) > 2:
# Emails cannot have more than one @
return None

if '' in email_parts:
# Email cant possibly be valid if there is nothing both before and after the @
return None

return candidate_email
except Exception:
return None

def get_repo_src_id(owner, repo, logger):


Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import pytest

from collectoss.tasks.github.util.util import sanity_check_email, extract_email

class TestSanityCheckEmail:

@pytest.mark.unit
def test_simple_valid_emails(self):
assert sanity_check_email("user@example.com") == "user@example.com"

Check notice

Code scanning / Bandit

Use of assert detected. The enclosed code will be removed when compiling to optimised byte code. Note test

Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.

#This is the primary real-world use case per the caller in facade_github.
result = sanity_check_email("12345+username@users.noreply.github.com")
assert result == "12345+username@users.noreply.github.com"

Check notice

Code scanning / Bandit

Use of assert detected. The enclosed code will be removed when compiling to optimised byte code. Note test

Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.

@pytest.mark.unit
def test_simple_valid_emails_with_padding(self):
assert sanity_check_email(" user@example.com ") == "user@example.com"

Check notice

Code scanning / Bandit

Use of assert detected. The enclosed code will be removed when compiling to optimised byte code. Note test

Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.

#This is the primary real-world use case per the caller in facade_github.
result = sanity_check_email(" 12345+username@users.noreply.github.com ")
assert result == "12345+username@users.noreply.github.com"

Check notice

Code scanning / Bandit

Use of assert detected. The enclosed code will be removed when compiling to optimised byte code. Note test

Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.


@pytest.mark.unit
def test_email_with_plus_addressing(self):
assert sanity_check_email("user+tag@example.com") == "user+tag@example.com"

Check notice

Code scanning / Bandit

Use of assert detected. The enclosed code will be removed when compiling to optimised byte code. Note test

Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.

@pytest.mark.unit
def test_email_with_dots_in_local(self):
assert sanity_check_email("first.last@example.com") == "first.last@example.com"

Check notice

Code scanning / Bandit

Use of assert detected. The enclosed code will be removed when compiling to optimised byte code. Note test

Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.

@pytest.mark.unit
def test_email_with_subdomain(self):
assert sanity_check_email("user@mail.example.co.uk") == "user@mail.example.co.uk"

Check notice

Code scanning / Bandit

Use of assert detected. The enclosed code will be removed when compiling to optimised byte code. Note test

Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.

@pytest.mark.unit
def test_non_ascii(self):
assert sanity_check_email("üser@example.com") is None

Check notice

Code scanning / Bandit

Use of assert detected. The enclosed code will be removed when compiling to optimised byte code. Note test

Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.
assert sanity_check_email("user@exämple.com") is None

Check notice

Code scanning / Bandit

Use of assert detected. The enclosed code will be removed when compiling to optimised byte code. Note test

Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.
assert sanity_check_email("user😀@example.com") is None

Check notice

Code scanning / Bandit

Use of assert detected. The enclosed code will be removed when compiling to optimised byte code. Note test

Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.

class TestSanityCheckEmailEdgeCasesInput:

@pytest.mark.unit
def test_integer_input_returns_none(self):
with pytest.raises(TypeError):
assert sanity_check_email(42) is None

Check notice

Code scanning / Bandit

Use of assert detected. The enclosed code will be removed when compiling to optimised byte code. Note test

Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.

@pytest.mark.unit
def test_boolean_input_returns_none(self):
with pytest.raises(TypeError):
assert sanity_check_email(True) is None

Check notice

Code scanning / Bandit

Use of assert detected. The enclosed code will be removed when compiling to optimised byte code. Note test

Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.


class TestSanityCheckEmailEdgeCases:

@pytest.mark.unit
def test_at_sign_only(self):
assert sanity_check_email("@") is None

Check notice

Code scanning / Bandit

Use of assert detected. The enclosed code will be removed when compiling to optimised byte code. Note test

Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.

@pytest.mark.unit
def test_at_sign_with_domain_no_local(self):
assert sanity_check_email("@example.com") is None

Check notice

Code scanning / Bandit

Use of assert detected. The enclosed code will be removed when compiling to optimised byte code. Note test

Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.

@pytest.mark.unit
def test_local_with_at_no_domain(self):
assert sanity_check_email("user@") is None

Check notice

Code scanning / Bandit

Use of assert detected. The enclosed code will be removed when compiling to optimised byte code. Note test

Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.

@pytest.mark.unit
def test_multiple_at_signs(self):
assert sanity_check_email("user@@example.com") is None

Check notice

Code scanning / Bandit

Use of assert detected. The enclosed code will be removed when compiling to optimised byte code. Note test

Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.


class TestEmailExtraction:

@pytest.mark.unit
def test_embedded_email_in_text_returns_email(self):
"""This is intended to test cases where the user put their name and email in the same field, either before or after"""
assert extract_email("Name user@example.com and more") == 'user@example.com'

Check notice

Code scanning / Bandit

Use of assert detected. The enclosed code will be removed when compiling to optimised byte code. Note test

Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.
Loading