import os
import re
import tempfile
import time
import unicodedata

import boto3
import nltk
import requests
from nltk.corpus import stopwords

# Make sure the stopwords corpus is available before calling stopwords.words()
nltk.download('stopwords', quiet=True)
# Use a set for O(1) membership tests when filtering words below
final_stopwords = set(stopwords.words('english') + stopwords.words('french'))
## Load creds
token = 'XXXXX'
headers_coda = {'Authorization': 'Bearer {}'.format(token)}

# boto3.resource returns a resource object, not a low-level client
s3_resource = boto3.resource(
    's3',
    region_name='eu-west-3')
client = boto3.client('textract')
client_language = boto3.client('comprehend', region_name='eu-west-2')
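# NOTE: Textract's asynchronous APIs can only read documents from an S3 bucket
# in the same region as the Textract endpoint, so the default region boto3
# resolves for the Textract client here needs to match the bucket's region
# (eu-west-3).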
## fetch data
URI_APPLICATION = 'https://coda.io/apis/v1/docs/XXX/tables/XXX/rows'
params = {
    'query': 'STATUS_PDF:"to_add"',
}
request_application = requests.get(URI_APPLICATION, headers=headers_coda, params=params)
rows_to_process = request_application.json()['items']
print(len(rows_to_process))
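# The rows endpoint is paginated, so the single GET above only returns the
# first page of matches. A minimal sketch of draining every page, assuming the
# standard Coda v1 `nextPageToken`/`pageToken` fields (helper name is mine):
def fetch_all_rows(uri, headers, params):
    """Follow Coda pagination tokens until every matching row is collected."""
    rows, page_params = [], dict(params)
    while True:
        page = requests.get(uri, headers=headers, params=page_params).json()
        rows.extend(page.get('items', []))
        if 'nextPageToken' not in page:  # last page reached
            return rows
        page_params['pageToken'] = page['nextPageToken']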
### SAVE PDF
list_payload = []
if len(rows_to_process) > 0:
    for i, val in enumerate(rows_to_process):
        if len(val['values']['XXX']) > 0:
            print(val['values']["XXX"])
            ID_ROW = val['id']
            URL = val['values']["XXX"]
            # Download the PDF and stage it in the temp directory
            download_file = requests.get(URL, allow_redirects=False)
            FILENAME_NEW_PDF = "{}.pdf".format(val['values']["XXX"].upper())
            LOCAL_PATH_FILE = os.path.join(
                tempfile.gettempdir(), FILENAME_NEW_PDF)
            PATH_S3_KEY = os.path.join(
                'DATA/SUSTAINABLE_DEVELOPMENT/APPLICATION_CALL',
                FILENAME_NEW_PDF
            )
            with open(LOCAL_PATH_FILE, "wb") as file:
                file.write(download_file.content)
            ### Save S3
            s3_resource.Bucket('datalake-datascience').upload_file(LOCAL_PATH_FILE,
                                                                   PATH_S3_KEY)
            ### analyse PDF
            # Kick off an asynchronous Textract job on the uploaded document
            response = client.start_document_analysis(
                DocumentLocation={
                    'S3Object': {
                        'Bucket': 'datalake-datascience',
                        'Name': PATH_S3_KEY
                    }
                },
                FeatureTypes=['TABLES', 'FORMS']
            )
            # Poll until the job leaves IN_PROGRESS, pausing between calls
            # to avoid hammering the API
            response_pdf_analysis = client.get_document_analysis(
                JobId=response['JobId'],
            )
            status = response_pdf_analysis['JobStatus']
            while status == "IN_PROGRESS":
                time.sleep(5)
                response_pdf_analysis = client.get_document_analysis(
                    JobId=response['JobId'],
                )
                status = response_pdf_analysis['JobStatus']
            # Results are paginated: follow NextToken until every page of
            # blocks has been collected
            list_pages = [response_pdf_analysis]
            while 'NextToken' in response_pdf_analysis:
                response_pdf_analysis = client.get_document_analysis(
                    JobId=response['JobId'],
                    NextToken=response_pdf_analysis['NextToken']
                )
                list_pages.append(response_pdf_analysis)
            ### Fetch text
            # Keep only LINE blocks: they hold the detected text
            final_text = []
            for page in list_pages:
                for block in page['Blocks']:
                    if block['BlockType'] == 'LINE':
                        final_text.append(block['Text'])
            final_text_join = ' '.join(final_text)
            ### Clean text
            # Strip accents, drop non-alphanumeric characters, lowercase,
            # then remove English and French stopwords
            final_text_join = ''.join((c for c in unicodedata.normalize('NFD', final_text_join)
                                       if unicodedata.category(c) != 'Mn'))
            final_text_join = re.sub('[^A-Za-z0-9]+', ' ', final_text_join).lower()
            to_check = ' '.join([word for word in final_text_join.split(' ') if word not in final_stopwords])
            print(len(to_check))
            COUNT_WORD = len(final_text_join.split(' '))
            ### Fetch language
            # Comprehend's synchronous APIs cap the input size, hence the
            # truncation to the first 5,000 characters
            response = client_language.detect_dominant_language(
                Text=to_check[:5000],
            )
            LANGUAGE = response['Languages'][0]['LanguageCode']
            ### Fetch keywords
            response = client_language.detect_key_phrases(
                Text=to_check[:5000],
                LanguageCode=LANGUAGE
            )
            # Keep only high-confidence key phrases
            KEYWORDS = [i['Text'] for i in response['KeyPhrases'] if i['Score'] > .9]
            ### Update database
            # Write the results back to the Coda row and flag it as processed
            uri = 'https://coda.io/apis/v1/docs/XX/tables/XX/rows/{}'.format(ID_ROW)
            payload = {
                'row': {
                    'cells': [
                        {'column': 'LANGUAGE', 'value': LANGUAGE},
                        {'column': 'TEXT', 'value': final_text_join},
                        {'column': 'TEXT_CLEAN', 'value': to_check},
                        {'column': 'KEYWORDS', 'value': KEYWORDS},
                        {'column': 'NB_WORDS', 'value': COUNT_WORD},
                        {'column': 'STATUS_PDF', 'value': 'updated'},
                    ],
                },
            }
            req = requests.put(uri, headers=headers_coda, json=payload)
            res = req.json()
            list_payload.append({'payload': payload, 'response': res})
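
# The inline polling above re-queries Textract in a loop for every document; a
# minimal sketch of the same wait factored into a reusable helper with a
# timeout guard (function and parameter names are my own, not from the source):
def wait_for_textract(textract, job_id, poll_seconds=5, timeout_seconds=600):
    """Poll get_document_analysis until the job leaves IN_PROGRESS."""
    waited = 0
    while waited < timeout_seconds:
        result = textract.get_document_analysis(JobId=job_id)
        if result['JobStatus'] != 'IN_PROGRESS':
            return result
        time.sleep(poll_seconds)
        waited += poll_seconds
    raise TimeoutError('Textract job {} still running after {}s'.format(job_id, timeout_seconds))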