Easily download Microsoft Sharepoint files
Simple script to pull files from Enterprise Sharepoint
A while back I had a project where someone was required to download some files from Microsoft Sharepoint each day and then upload them into GCP. As you can imagine this was a tedious task so I was asked to write some code to do this instead. This is written in Python and requires the sharepoint URL, client id and secret as well as any credentials for the file upload location (in this case GCP).
First we need to set up the requirements.txt with the required libraries and versions:
Office365-REST-Python-Client==2.3.13
urllib3==1.26.10
google-cloud-datastore==1.12.0
google-cloud-storage==1.28.1
google-cloud-secret-manager==1.0.0Then we need to set up the credentials and the classes. Some things to note here: URL = the unique URL for your sharepoint site site_name = this is in the URL and is whatever your site is called doc_library = the folder location of the files you wish to download sharepoint_client_id & sharepoint_client_secret = these need to be generated by the site admin for your Sharepoint site.
from office365.sharepoint.client_context import ClientContext
from office365.runtime.auth.client_credential import ClientCredential
from office365.sharepoint.files.file import File
import datetime
import os
from google.cloud import secretmanager
url = 'https://example.sharepoint.com/sites/yoursite'
site_name = 'yoursite'
doc_library = 'Documents'
sharepoint_client_id = os.getenv('SHAREPOINT_CLIENT_ID')
sharepoint_client_secret = os.getenv('SHAREPOINT_CLIENT_SECRET')
region_name = "ap-southeast-2"
client_credentials = ClientCredential(sharepoint_client_id, sharepoint_client_secret)
class SharePoint:
def _auth(self):
conn = ClientContext(url).with_credentials(client_credentials)
return conn
def _get_files_list(self, folder_name):
conn = self._auth()
target_folder_url = f'{doc_library}/{folder_name}'
root_folder = conn.web.get_folder_by_server_relative_url(target_folder_url)
root_folder.expand(["Files", "Folders"]).get().execute_query()
return root_folder.files
def download_file(self, file_name, folder_name):
conn = self._auth()
file_url = f'/sites/{site_name}/{doc_library}/{folder_name}/{file_name}'
file = File.open_binary(conn, file_url)
return file.content
def download_latest_file(self, folder_name):
date_format = "%Y-%m-%dT%H:%M:%SZ"
files_list = self._get_files_list(folder_name)
file_dict = {}
for file in files_list:
dt_obj = datetime.datetime.strptime(file.time_last_modified, date_format)
file_dict[file.name] = dt_obj
# sort dict object to get the latest file
file_dict_sorted = {key:value for key, value in sorted(file_dict.items(), key=lambda item:item[1], reverse=True)}
latest_file_name = next(iter(file_dict_sorted))
print(f"Downloading {latest_file_name}")
content = self.download_file(latest_file_name, folder_name)
return latest_file_name, contentThe next step is to set up the main.py with the required functions:
from sharepoint_api import SharePoint
from pathlib import PurePath
import os
import logging
from google.cloud import storage
if 'GCP_PROJECT' in os.environ:
PROJECT = os.environ['GCP_PROJECT']
else:
PROJECT = 'development'
if PROJECT == 'production':
BUCKET_NAME = 'folder_prod'
else:
BUCKET_NAME = 'folder_dev'
LOG = logging.getLogger()
logging.basicConfig(level=logging.INFO,
format='%(asctime)s %(levelname)-8s %(message)s',
datefmt='%a, %d %b %Y %H:%M:%S')
def check_file_exists(file_name):
storage_client = storage.Client()
bucket_name = f'{BUCKET_NAME}'
bucket = storage_client.bucket(bucket_name)
file_exists = storage.Blob(bucket=bucket, name=f'upload/{file_name}').exists(storage_client)
if file_exists != True:
upload_gcs_file(file_name)
else:
LOG.info(f"File {file_name} already exists in {bucket_name}, Skipping Upload")
def get_latest_file(folder):
file_locations = ["Usage Reports", "Usage Reports/Playback Sessions", "Usage Reports/Channel Sessions"]
for folder in file_locations:
file_name, content = SharePoint().download_latest_file(folder)
save_file(file_name, content)
check_file_exists(file_name)
return 'OK'
def save_file(file_name, file_obj):
file_dir_path = PurePath(f'/tmp/{file_name}')
with open(file_dir_path, 'wb') as f:
f.write(file_obj)
def upload_gcs_file(file_name):
gcs_client = storage.Client()
gcs_bucket = storage.Client(project=PROJECT).bucket(BUCKET_NAME)
bucket = gcs_client.get_bucket(gcs_bucket)
blob = bucket.blob(f'upload/{file_name}')
blob.upload_from_filename(f'/tmp/{file_name}')
LOG.info(f"Successfully uploaded file {file_name} to {gcs_bucket}")You can see that for my project I needed to determine between develop and prod GCP projects and select a bucket accordingly. This was set up to be run on a pipeline within GCP that would allow us to check an environment variable to select the correct project and bucket to upload the files. Once this has been selected the functions are set up to check if the file already exists in GCP, download the latest file, save the file and then upload it into GCP.
Again I highly recommend that you store your credentials in Secrets Manager or similar and fetch them rather than in the code itself.






