Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add authentication via Entra with a certificate #3064

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
169 changes: 145 additions & 24 deletions connectors/sources/sharepoint_online.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from aiofiles.tempfile import NamedTemporaryFile
from aiohttp.client_exceptions import ClientPayloadError, ClientResponseError
from aiohttp.client_reqrep import RequestInfo
from azure.identity import CertificateCredential
from fastjsonschema import JsonSchemaValueException

from connectors.access_control import (
Expand Down Expand Up @@ -193,7 +194,7 @@ class MicrosoftSecurityToken:
- https://learn.microsoft.com/en-us/azure/active-directory/develop/quickstart-register-app
"""

def __init__(self, http_session, tenant_id, tenant_name, client_id, client_secret):
def __init__(self, http_session, tenant_id, tenant_name, client_id):
"""Initializer.

Args:
Expand All @@ -207,7 +208,6 @@ def __init__(self, http_session, tenant_id, tenant_name, client_id, client_secre
self._tenant_id = tenant_id
self._tenant_name = tenant_name
self._client_id = client_id
self._client_secret = client_secret

self._token_cache = CacheWithTimeout()

Expand Down Expand Up @@ -260,7 +260,16 @@ async def _fetch_token(self):
raise NotImplementedError


class GraphAPIToken(MicrosoftSecurityToken):
class SecretAPIToken(MicrosoftSecurityToken):
def __init__(self, http_session, tenant_id, tenant_name, client_id, client_secret):
super().__init__(http_session, tenant_id, tenant_name, client_id)
self._client_secret = client_secret

async def _fetch_token(self):
return await super()._fetch_token()


class GraphAPIToken(SecretAPIToken):
"""Token to connect to Microsoft Graph API endpoints."""

@retryable(retries=3)
Expand All @@ -283,7 +292,7 @@ async def _fetch_token(self):
return access_token, expires_in


class SharepointRestAPIToken(MicrosoftSecurityToken):
class SharepointRestAPIToken(SecretAPIToken):
"""Token to connect to Sharepoint REST API endpoints."""

@retryable(retries=DEFAULT_RETRY_COUNT)
Expand Down Expand Up @@ -312,6 +321,43 @@ async def _fetch_token(self):
return access_token, expires_in


class EntraAPIToken(MicrosoftSecurityToken):
"""Token to connect to Microsoft Graph API endpoints."""

def __init__(
self,
http_session,
tenant_id,
tenant_name,
client_id,
certificate,
private_key,
scope,
):
super().__init__(http_session, tenant_id, tenant_name, client_id)
self._certificate = certificate
self._private_key = private_key
self._scope = scope

@retryable(retries=3)
async def _fetch_token(self):
"""Fetch API token for usage with Graph API

Returns:
(str, int) - a tuple containing access token as a string and number of seconds it will be valid for as an integer
"""

secrets_concat = self._certificate + "\n" + self._private_key

credentials = CertificateCredential(
self._tenant_id, self._client_id, certificate_data=secrets_concat.encode()
)

token = credentials.get_token(self._scope)

return token.token, token.expires_on


def retryable_aiohttp_call(retries):
# TODO: improve utils.retryable to allow custom logic
# that can help choose what to retry
Expand Down Expand Up @@ -522,7 +568,15 @@ def _compute_retry_after(self, retry_after, retry_count, backoff):


class SharepointOnlineClient:
def __init__(self, tenant_id, tenant_name, client_id, client_secret):
def __init__(
self,
tenant_id,
tenant_name,
client_id,
client_secret=None,
certificate=None,
private_key=None,
):
# Sharepoint / Graph API has quite strict throttling policies
# If connector is overzealous, it can be banned for not respecting throttling policies
# However if connector has a low setting for the tcp_connector limit, then it'll just be slow.
Expand All @@ -544,12 +598,35 @@ def __init__(self, tenant_id, tenant_name, client_id, client_secret):
"https://(.*).sharepoint.com"
) # Used later for url validation

self.graph_api_token = GraphAPIToken(
self._http_session, tenant_id, tenant_name, client_id, client_secret
)
self.rest_api_token = SharepointRestAPIToken(
self._http_session, tenant_id, tenant_name, client_id, client_secret
)
if client_secret and not certificate and not private_key:
self.graph_api_token = GraphAPIToken(
self._http_session, tenant_id, tenant_name, client_id, client_secret
)
self.rest_api_token = SharepointRestAPIToken(
self._http_session, tenant_id, tenant_name, client_id, client_secret
)
elif certificate and private_key:
self.graph_api_token = EntraAPIToken(
self._http_session,
tenant_id,
tenant_name,
client_id,
certificate,
private_key,
"https://graph.microsoft.com/.default",
)
self.rest_api_token = EntraAPIToken(
self._http_session,
tenant_id,
tenant_name,
client_id,
certificate,
private_key,
f"https://{self._tenant_name}.sharepoint.com/.default",
seanstory marked this conversation as resolved.
Show resolved Hide resolved
)
else:
msg = "Unexpected authentication: either a client_secret or certificate+private_key should be provided"
raise Exception(msg)

self._logger = logger

Expand Down Expand Up @@ -1157,11 +1234,27 @@ def client(self):
tenant_id = self.configuration["tenant_id"]
tenant_name = self.configuration["tenant_name"]
client_id = self.configuration["client_id"]
auth_method = self.configuration["auth_method"]
client_secret = self.configuration["secret_value"]
certificate = self.configuration["certificate"]
private_key = self.configuration["private_key"]

self._client = SharepointOnlineClient(
tenant_id, tenant_name, client_id, client_secret
)
if auth_method == "secret":
self._client = SharepointOnlineClient(
tenant_id, tenant_name, client_id, client_secret=client_secret
)
elif auth_method == "certificate":
self._client = SharepointOnlineClient(
tenant_id,
tenant_name,
client_id,
client_secret,
certificate=certificate,
private_key=private_key,
)
else:
msg = f"Unexpected auth method: {auth_method}"
raise Exception(msg)

return self._client

Expand All @@ -1183,41 +1276,69 @@ def get_default_configuration(cls):
"order": 3,
"type": "str",
},
"auth_method": {
"label": "Authentication Method",
"order": 4,
"type": "str",
"display": "dropdown",
"options": [
{"label": "Client Secret", "value": "secret"},
{"label": "Certificate", "value": "certificate"},
],
"value": "certificate",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this will be a breaking change, as on upgrade, we'll add this new config to existing connectors, right? But existing connectors will be using secret? I think we should default to secret, but tooltip that certificate is preferred?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah you're right, it should be "secret" for backwards compatibility!

},
"secret_value": {
"label": "Secret value",
"order": 4,
"order": 5,
"sensitive": True,
"type": "str",
"depends_on": [{"field": "auth_method", "value": "secret"}],
},
"certificate": {
"label": "Content of certificate file",
"display": "textarea",
"sensitive": True,
"order": 5,
artem-shelkovnikov marked this conversation as resolved.
Show resolved Hide resolved
"type": "str",
"depends_on": [{"field": "auth_method", "value": "certificate"}],
},
"private_key": {
"label": "Content of private key file",
"display": "textarea",
"sensitive": True,
"order": 6,
artem-shelkovnikov marked this conversation as resolved.
Show resolved Hide resolved
"type": "str",
"depends_on": [{"field": "auth_method", "value": "certificate"}],
},
"site_collections": {
"display": "textarea",
"label": "Comma-separated list of sites",
"tooltip": "A comma-separated list of sites to ingest data from. If enumerating all sites, use * to include all available sites, or specify a list of site names. Otherwise, specify a list of site paths.",
"order": 5,
"order": 7,
artem-shelkovnikov marked this conversation as resolved.
Show resolved Hide resolved
"type": "list",
"value": "*",
},
"enumerate_all_sites": {
"display": "toggle",
"label": "Enumerate all sites?",
"tooltip": "If enabled, sites will be fetched in bulk, then filtered down to the configured list of sites. This is efficient when syncing many sites. If disabled, each configured site will be fetched with an individual request. This is efficient when syncing fewer sites.",
"order": 6,
"order": 8,
artem-shelkovnikov marked this conversation as resolved.
Show resolved Hide resolved
"type": "bool",
"value": True,
},
"fetch_subsites": {
"display": "toggle",
"label": "Fetch sub-sites of configured sites?",
"tooltip": "Whether subsites of the configured site(s) should be automatically fetched.",
"order": 7,
"order": 9,
artem-shelkovnikov marked this conversation as resolved.
Show resolved Hide resolved
"type": "bool",
"value": True,
"depends_on": [{"field": "enumerate_all_sites", "value": False}],
},
"use_text_extraction_service": {
"display": "toggle",
"label": "Use text extraction service",
"order": 8,
"order": 10,
artem-shelkovnikov marked this conversation as resolved.
Show resolved Hide resolved
"tooltip": "Requires a separate deployment of the Elastic Text Extraction Service. Requires that pipeline settings disable text extraction.",
"type": "bool",
"ui_restrictions": ["advanced"],
Expand All @@ -1226,7 +1347,7 @@ def get_default_configuration(cls):
"use_document_level_security": {
"display": "toggle",
"label": "Enable document level security",
"order": 9,
"order": 11,
artem-shelkovnikov marked this conversation as resolved.
Show resolved Hide resolved
"tooltip": "Document level security ensures identities and permissions set in Sharepoint Online are maintained in Elasticsearch. This enables you to restrict and personalize read-access users and groups have to documents in this index. Access control syncs ensure this metadata is kept up to date in your Elasticsearch documents.",
"type": "bool",
"value": False,
Expand All @@ -1235,7 +1356,7 @@ def get_default_configuration(cls):
"depends_on": [{"field": "use_document_level_security", "value": True}],
"display": "toggle",
"label": "Fetch drive item permissions",
"order": 10,
"order": 12,
artem-shelkovnikov marked this conversation as resolved.
Show resolved Hide resolved
"tooltip": "Enable this option to fetch drive item specific permissions. This setting can increase sync time.",
"type": "bool",
"value": True,
Expand All @@ -1244,7 +1365,7 @@ def get_default_configuration(cls):
"depends_on": [{"field": "use_document_level_security", "value": True}],
"display": "toggle",
"label": "Fetch unique page permissions",
"order": 11,
"order": 13,
artem-shelkovnikov marked this conversation as resolved.
Show resolved Hide resolved
"tooltip": "Enable this option to fetch unique page permissions. This setting can increase sync time. If this setting is disabled a page will inherit permissions from its parent site.",
"type": "bool",
"value": True,
Expand All @@ -1253,7 +1374,7 @@ def get_default_configuration(cls):
"depends_on": [{"field": "use_document_level_security", "value": True}],
"display": "toggle",
"label": "Fetch unique list permissions",
"order": 12,
"order": 14,
artem-shelkovnikov marked this conversation as resolved.
Show resolved Hide resolved
"tooltip": "Enable this option to fetch unique list permissions. This setting can increase sync time. If this setting is disabled a list will inherit permissions from its parent site.",
"type": "bool",
"value": True,
Expand All @@ -1262,7 +1383,7 @@ def get_default_configuration(cls):
"depends_on": [{"field": "use_document_level_security", "value": True}],
"display": "toggle",
"label": "Fetch unique list item permissions",
"order": 13,
"order": 15,
artem-shelkovnikov marked this conversation as resolved.
Show resolved Hide resolved
"tooltip": "Enable this option to fetch unique list item permissions. This setting can increase sync time. If this setting is disabled a list item will inherit permissions from its parent site.",
"type": "bool",
"value": True,
Expand Down
1 change: 1 addition & 0 deletions requirements/framework.txt
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,4 @@ notion-client==2.2.1
certifi==2024.7.4
aioboto3==12.4.0
pyasn1<0.6.1
azure-identity==1.19.0
Loading