#******************************************************************************************
# Author - Nirmallya Mukherjee
# This lambda function will open an S3 trigger JSON, check the bucket and file details
# Use the S3 api to read the PDF file and use the tika library to extract the text content
# Log the text content and metadata in cloudwatch
#
# Imp - This function will need a timeout of 2mins (depending on the PDF size) and mem of 256MB
#       Create a custom role for lambda with the following specifications
#       "lambda-multirole" with "CloudWatchFullAccess" and "AmazonS3FullAccess" policies
#
#******************************************************************************************
import importlib.util
import json
import logging
import os
from urllib.parse import unquote_plus

import boto3

logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Importable module names that must be present before any work is attempted.
required_pkgs = ['tika']

# Bucket used for the extracted text when TARGET_BUCKET is not configured.
DEFAULT_TARGET_BUCKET = "skl-dest"

# Environment variable names containing any of these fragments are masked
# before logging; the Lambda runtime exposes its AWS credentials this way.
_SENSITIVE_ENV_MARKERS = ('SECRET', 'TOKEN', 'ACCESS_KEY', 'PASSWORD')


def _loggable_environment():
    """Return a plain dict copy of os.environ with credential-like values masked."""
    return {
        name: '***' if any(marker in name.upper() for marker in _SENSITIVE_ENV_MARKERS) else value
        for name, value in os.environ.items()
    }


def lambda_handler(event, context):
    """Lambda entry point: verify dependencies, then extract text for the S3 event.

    Args:
        event: The S3 trigger payload; it is logged and handed to extract_content().
        context: The Lambda context object (unused).

    Returns:
        A dict with 'statusCode' 500 and an error body when a required package
        cannot be found, otherwise 'statusCode' 200 once extraction has finished.
        Exceptions raised by extract_content() propagate to the caller.
    """
    logger.info('********************** Environment and Event variables are *********************')
    # Log a masked copy: the raw environment would put credentials in the logs.
    logger.info(_loggable_environment())
    logger.info(event)

    # find_spec() only locates the module, it does not import it, and it does not
    # rely on pip's private internals being available.
    missing_pkgs = [pkg for pkg in required_pkgs if importlib.util.find_spec(pkg) is None]
    if missing_pkgs:
        logger.error('Apache Tika dependency is not there. Exiting')
        return {
            'statusCode': 500,
            'body': json.dumps('Missing dependencies, aborting!')
        }

    logger.info('All dependencies found, environment is looking good. Proceeding ...')
    extract_content(event)
    return {
        'statusCode': 200,
        'body': json.dumps('Execution is now complete')
    }


def _extract_record(s3client, parser, record, targetBucket):
    """Extract the text of the object named by one event record and store it as <key>.txt."""
    bucket = record['s3']['bucket']['name']
    # Object keys in S3 event notifications are URL-encoded (a space arrives as '+'),
    # so decode before using the key with the S3 API.
    key = unquote_plus(record['s3']['object']['key'])
    print('The s3 bucket is', bucket, 'and the file name is', key)

    response = s3client.get_object(Bucket=bucket, Key=key)
    pdffile = response["Body"]
    print('The binary pdf file type is', type(pdffile))

    # Treat a falsy parser result as an empty dict so the lookups below are safe.
    rawcontent = parser.from_buffer(pdffile) or {}
    print('The raw PDF content type is', type(rawcontent))

    meta = rawcontent.get("metadata")
    print('Metadata is', meta)

    # 'content' may be missing or None when no text could be extracted; fall back
    # to an empty string so the replace() below cannot fail.
    content = rawcontent.get('content') or ''
    # After the content extraction there are too many \n at the top of the content; remove them all
    # NOTE(review): this strips every double newline in the document, not only the
    # leading ones, so paragraphs are joined together - confirm this is intended.
    content = content.replace("\n\n", "")
    print('Content is', content)

    s3client.put_object(Bucket=targetBucket, Key=key + ".txt", Body=content)


def extract_content(event):
    """Extract text from every object referenced in the S3 event and upload it.

    For each entry in event['Records'] the object is read from its source bucket,
    parsed with tika, and the text is written to the target bucket under the
    original key with '.txt' appended. The target bucket comes from the
    TARGET_BUCKET environment variable, falling back to DEFAULT_TARGET_BUCKET.

    Args:
        event: S3 event dict containing a 'Records' list whose entries carry
            ['s3']['bucket']['name'] and ['s3']['object']['key'].

    Raises:
        KeyError: If the event does not have the expected structure.
        Exception: Errors from the S3 client or from tika are not caught here.
    """
    # Imported here so lambda_handler can report a missing dependency with a
    # 500 response instead of the module failing to load.
    from tika import parser

    # Read the target bucket from the lambda environment variable
    targetBucket = os.environ.get('TARGET_BUCKET', DEFAULT_TARGET_BUCKET)
    print('Target bucket is', targetBucket)

    s3client = boto3.client('s3')
    # A notification may carry more than one record; handle each of them.
    for record in event['Records']:
        _extract_record(s3client, parser, record, targetBucket)

    print('All done, returning from extract content method')