Let's analyze and improve the Dockerfile:
# Stage 1: Base image
# Holds everything that changes rarely: system libraries, the Chrome binaries
# and the Python dependencies. The application code is added in a later stage.
FROM public.ecr.aws/lambda/python:3.12 AS base

# Install dependencies in a single layer with improved organization
RUN dnf install -y \
        # Core utilities
        unzip \
        # Audio and UI dependencies
        alsa-lib \
        atk \
        at-spi2-atk \
        cups-libs \
        dbus-glib \
        dbus-glib-devel \
        gtk3 \
        # X11 libraries
        libXcomposite \
        libXcursor \
        libXdamage \
        libXext \
        libXi \
        libXrandr \
        libXScrnSaver \
        libXt \
        libXtst \
        # System dependencies
        mesa-libgbm \
        nss \
        pango \
        # Fonts for proper rendering
        google-noto-sans-cjk-ttc-fonts \
        google-noto-serif-cjk-ttc-fonts \
        # X11 server components
        xorg-x11-server-Xvfb \
        xorg-x11-xauth \
    && dnf clean all \
    && rm -rf /var/cache/dnf

# Chrome for Testing release used for BOTH downloads below, so the browser and
# the driver can never drift apart. Override at build time with
#   --build-arg CHROME_VERSION=<version>
ARG CHROME_VERSION=130.0.6723.91

# Download Chrome and ChromeDriver in a separate layer.
# `-f` makes curl exit non-zero on an HTTP error instead of saving the error
# page as a .zip, so a bad version/URL fails at the download step.
RUN cd /tmp \
    && curl -fL -o "chromedriver-linux64.zip" "https://storage.googleapis.com/chrome-for-testing-public/${CHROME_VERSION}/linux64/chromedriver-linux64.zip" \
    && curl -fL -o "chrome-headless-shell-linux64.zip" "https://storage.googleapis.com/chrome-for-testing-public/${CHROME_VERSION}/linux64/chrome-headless-shell-linux64.zip" \
    && unzip chromedriver-linux64.zip -d /opt/ \
    && unzip chrome-headless-shell-linux64.zip -d /opt/ \
    && rm -f *.zip

# Copy and install requirements first to leverage Docker cache
COPY requirements.txt .
RUN pip install -r requirements.txt --no-cache-dir

# Stage 2: application image, built on top of the dependency stage
FROM base AS app

# Add the handler module last so code-only changes reuse the cached layers above
COPY lambda_function.py ./

# Handler entry point in "<module>.<function>" form
CMD ["lambda_function.lambda_handler"]

import logging

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# Filesystem locations of the headless Chrome binary and the ChromeDriver
# executable. The Dockerfile unzips both archives into /opt/; the directory
# names below are presumably the archives' top-level folders — verify after
# changing the Chrome version.
CHROME_PATH = "/opt/chrome-headless-shell-linux64/chrome-headless-shell"
CHROME_DRIVER_PATH = "/opt/chromedriver-linux64/chromedriver"
def create_driver():
    """Build a Chrome WebDriver bound to the bundled browser and driver.

    The browser binary is taken from ``CHROME_PATH`` and the driver
    executable from ``CHROME_DRIVER_PATH``.

    Returns:
        A ``webdriver.Chrome`` instance configured with the flags below.
    """
    options = webdriver.ChromeOptions()
    options.binary_location = CHROME_PATH

    # Command-line switches, applied in the order listed.
    for flag in (
        '--headless=new',  # Using new headless mode
        '--no-sandbox',
        '--disable-gpu',
        '--disable-dev-shm-usage',
        '--disable-dev-tools',
        '--single-process',
        '--window-size=1920,1080',
    ):
        options.add_argument(flag)

    return webdriver.Chrome(
        service=Service(executable_path=CHROME_DRIVER_PATH),
        options=options,
    )


def lambda_handler(event, context):
    """Lambda handler for Selenium-based web automation.

    Opens the requested page in headless Chrome, waits for its ``<body>``
    element and reports basic facts about the loaded page.

    Args:
        event: Invocation payload. The optional ``'url'`` key selects the
            page to load; it defaults to ``'https://www.google.com'``.
        context: Lambda context object (not used).

    Returns:
        On success ``{'statusCode': 200, 'body': {...}}`` where the body
        holds ``'title'``, ``'url'`` and ``'page_source_length'``.
        On any failure ``{'statusCode': 500, 'body': {'error': <message>}}``.
        The handler never propagates an exception to the caller.
    """
    logger = logging.getLogger(__name__)
    driver = None
    try:
        driver = create_driver()

        # Navigate to the target URL
        url = event.get('url', 'https://www.google.com')
        driver.get(url)

        # Wait (up to 10 s) for the page body to be present before reading it
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )

        page_info = {
            'title': driver.title,
            'url': driver.current_url,
            'page_source_length': len(driver.page_source),
        }
        return {
            'statusCode': 200,
            'body': page_info,
        }
    except Exception as e:
        # Top-level boundary: record the full traceback before converting the
        # failure into a 500 response, otherwise the cause is lost.
        logger.exception("Error occurred")
        return {
            'statusCode': 500,
            'body': {
                'error': str(e),
            },
        }
    finally:
        if driver is not None:
            try:
                driver.quit()
            except Exception:
                # A failing cleanup must not replace the response that was
                # already prepared above.
                logger.warning("Failed to quit WebDriver cleanly", exc_info=True)


Important notes about the build process:
# Build on Linux/GitHub Actions/CodeBuild (NOT on macOS)
# --platform linux/amd64 : the Dockerfile downloads linux64 (x86_64) Chrome
#                          binaries, so the image must be x86_64 even when the
#                          build host is ARM.
# --provenance=false     : recommended by the AWS Lambda container image
#                          instructions so the pushed image is accepted by Lambda.
docker buildx build --platform linux/amd64 --provenance=false -t selenium-lambda .
# Quote the registry so an empty or unusual ECR_REGISTRY value fails visibly
# and is not word-split by the shell.
docker tag selenium-lambda:latest "${ECR_REGISTRY}/selenium-lambda:latest"
docker push "${ECR_REGISTRY}/selenium-lambda:latest"
import logging
# Root logger, set to emit INFO and above.
logger = logging.getLogger()
logger.setLevel("INFO")

# Add to your code:
# - an informational message when the browser session starts
logger.info("Starting browser session")
# - an ERROR record that also includes the current exception's traceback
#   (meant to be called from inside an `except` block)
logger.exception("Error occurred")
Running Selenium with Chrome in AWS Lambda requires careful consideration of various factors, including system dependencies, resource constraints, and deployment processes. By following these guidelines and best practices, you can create a robust and efficient serverless web automation solution.