Explore

pdf plumber & sharepoint

from ayx import Alteryx, Package

import os, io, base64

import pandas as pd

import pdfplumber

import requests

Package.installPackages(['pdfplumber'])

# ─── Azure AD / Graph config (set as environment variables in Alteryx Designer) ──────────

TENANT_ID = os.getenv("AZURE_TENANT_ID")

CLIENT_ID = os.getenv("AZURE_CLIENT_ID")

CLIENT_SECRET = os.getenv("AZURE_CLIENT_SECRET")

AUTHORITY = f"https://login.microsoftonline.com/{TENANT_ID}"

TOKEN_URL = f"{AUTHORITY}/oauth2/v2.0/token"

SCOPE = "https://graph.microsoft.com/.default"

# ─── Helper: Get token ───────────────────────────────────────────────────────────────────

def get_graph_token():

data = {

"client_id": CLIENT_ID,

"client_secret": CLIENT_SECRET,

"scope": SCOPE,

"grant_type": "client_credentials",

}

r = requests.post(TOKEN_URL, data=data, timeout=60)

r.raise_for_status()

return r.json()["access_token"]

# ─── Helper: Download PDF from share link ─────────────────────────────────────────────────

def graph_get_drive_item_and_bytes(token: str, share_url: str):

b64 = base64.urlsafe_b64encode(share_url.encode("utf-8")).decode("utf-8").rstrip("=")

meta_url = f"https://graph.microsoft.com/v1.0/shares/u!{b64}/driveItem"

meta_res = requests.get(meta_url, headers={"Authorization": f"Bearer {token}"}, timeout=60)

meta_res.raise_for_status()

item = meta_res.json()

# Download content

content_url = f"https://graph.microsoft.com/v1.0/drives/{item['parentReference']['driveId']}/items/{item['id']}/content"

content_res = requests.get(content_url, headers={"Authorization": f"Bearer {token}"}, timeout=120)

content_res.raise_for_status()

return item, content_res.content

# ─── Helper: Upload .txt to same folder ───────────────────────────────────────────────────

def graph_upload_text_to_same_folder(token: str, drive_id: str, folder_path: str, filename: str, content: str):

"""

Uploads (or replaces) a text file in the same folder as the original PDF.

folder_path = path relative to drive root, like '/Documents/Payroll/Input'

"""

if not folder_path.startswith("/"):

folder_path = "/" + folder_path

url = f"https://graph.microsoft.com/v1.0/drives/{drive_id}/root:{folder_path}/{filename}:/content"

res = requests.put(url, headers={"Authorization": f"Bearer {token}"}, data=content.encode('utf-8'))

res.raise_for_status()

return res.json()

# ─── PDF Text Extraction ─────────────────────────────────────────────────────────────────

def read_pdf_text_from_bytes(pdf_bytes: bytes) -> str:

with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:

parts = []

for page in pdf.pages:

t = page.extract_text()

if t:

parts.append(t)

return ("\n\n".join(parts)).strip() or "[No text extracted]"

# ─── Main Flow ────────────────────────────────────────────────────────────────────────────

df_in = Alteryx.read("#1")

out_rows = []

try:

token = get_graph_token()

except Exception as e:

df_out = pd.DataFrame([{"PDF_Path": "", "Text_Path": "", "Status": f"Auth error: {e}"}])

Alteryx.write(df_out, 1)

raise

for _, row in df_in.iterrows():

share_url = str(row['PDF_Path']).strip()

txt_name = ""

status = ""

try:

# 1. Get PDF bytes + metadata

item, pdf_bytes = graph_get_drive_item_and_bytes(token, share_url)

drive_id = item['parentReference']['driveId']

parent_path = item['parentReference']['path'] # e.g. /drive/root:/Documents/Payroll/Input

# Extract folder path relative to drive root:

folder_rel_path = parent_path.split("root:")[-1].rstrip("/")

# 2. Extract text

text = read_pdf_text_from_bytes(pdf_bytes)

# 3. Determine .txt name

pdf_name = item['name']

txt_name = os.path.splitext(pdf_name)[0] + ".txt"

# 4. Upload text file back to same folder

graph_upload_text_to_same_folder(token, drive_id, folder_rel_path, txt_name, text)

status = f"Uploaded {txt_name} to {folder_rel_path}"

except Exception as e:

status = f"Error: {e}"

out_rows.append({

"PDF_Path": share_url,

"Text_Path": txt_name,

"Status": status

})

df_out = pd.DataFrame(out_rows)

Alteryx.write(df_out, 1)

Want to print your doc?
This is not the way.

Try clicking the ··· in the right corner or using a keyboard shortcut (

CtrlP

) instead.