commit 6c6c83730103e7b72e6c7d508156ecfa180dfe3a
Author: Hadley Rich
Date:   Mon Nov 11 10:26:47 2024 +1300

    Initial

diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..3f03f07
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,62 @@
+# Ignore Python bytecode files
+__pycache__/
+*.py[cod]
+
+# Ignore distribution / packaging files
+build/
+dist/
+*.egg-info/
+.eggs/
+wheels/
+
+# Ignore virtual environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Ignore test and coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Ignore Jupyter Notebook checkpoints
+.ipynb_checkpoints
+
+# Ignore IPython profile directories
+profile_default/
+ipython_config.py
+
+# Ignore pyenv files
+.python-version
+
+# Keep poetry.lock in the build context: the Dockerfile copies it
+# alongside pyproject.toml to install the locked dependency versions.
+
+# Ignore VS Code settings
+.vscode/
+
+# Ignore Dockerfile and Dockerignore itself
+Dockerfile
+.dockerignore
+
+# Ignore Git files
+.git/
+.gitignore
+
+# Ignore logs and temporary files
+*.log
+*.tmp
\ No newline at end of file
diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
new file mode 100644
index 0000000..5a914ef
--- /dev/null
+++ b/.github/workflows/build.yaml
@@ -0,0 +1,34 @@
+name: Build and Publish Docker Image
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Login to Docker registry
+        uses: docker/login-action@v3
+        with:
+          registry: git.nice.net.nz
+          username: hads
+          password: ${{ secrets.PAT }}
+
+      - name: Build and push Docker image
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          # Pull requests only verify that the image builds; publishing the
+          # :latest tag is reserved for pushes to main.
+          push: ${{ github.event_name != 'pull_request' }}
+          tags: git.nice.net.nz/hads/hinpdof:latest
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..564d55d
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,97 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# poetry: poetry.lock is tracked on purpose, the Docker build copies it
+#poetry.lock
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# VS Code
+.vscode/
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# Pyre type checker
+.pyre/
+
+# End of https://www.toptal.com/developers/gitignore/api/python
\ No newline at end of file
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..4a458b8
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,25 @@
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v5.0.0
+    hooks:
+      - id: check-toml
+      - id: check-json
+      - id: check-merge-conflict
+      - id: debug-statements
+      - id: end-of-file-fixer
+      - id: trailing-whitespace
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.7.3
+    hooks:
+      - id: ruff
+        args: [--fix]
+      - id:
ruff-format
+  - repo: local
+    hooks:
+      - id: tests
+        name: run tests
+        require_serial: true
+        entry: pytest -v test_app.py
+        language: system
+        types: [python]
+        stages: [pre-push]
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..6352368
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,68 @@
+FROM python:3.13-slim AS base
+
+# Build stage: installs the Python dependencies into /app/.venv with Poetry.
+FROM base AS builder
+
+RUN pip install --no-cache-dir poetry==1.8.3
+
+ENV PYTHONUNBUFFERED=1 \
+    PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONPATH=/app \
+    POETRY_NO_INTERACTION=1 \
+    POETRY_VIRTUALENVS_IN_PROJECT=1 \
+    POETRY_VIRTUALENVS_CREATE=1 \
+    POETRY_CACHE_DIR=/tmp/poetry_cache
+
+WORKDIR /app
+
+# The glob keeps the build working when poetry.lock is not present in the
+# build context; Poetry then resolves versions from pyproject.toml alone.
+COPY pyproject.toml poetry.lock* ./
+
+RUN poetry install --without dev --no-root --compile
+
+# Runtime stage: system libraries, the prebuilt virtualenv and the app only.
+FROM base
+
+# ARG values are scoped to a single stage, so declare it where apt-get runs.
+ARG DEBIAN_FRONTEND="noninteractive"
+
+# Pango/HarfBuzz shared libraries required by WeasyPrint.
+RUN apt-get update \
+    && apt-get dist-upgrade -y \
+    && apt-get install -y --no-install-recommends \
+        libharfbuzz-subset0 \
+        libpango-1.0-0 \
+        libpangoft2-1.0-0 \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN adduser --system --uid 1000 --group app
+USER app
+
+WORKDIR /app
+
+ENV PYTHONUNBUFFERED=1 \
+    VIRTUAL_ENV=/app/.venv \
+    PATH="/app/.venv/bin:$PATH"
+
+COPY --from=builder ${VIRTUAL_ENV} ${VIRTUAL_ENV}
+
+COPY app.py ./
+
+ARG PORT=8080
+ENV PORT=$PORT
+EXPOSE $PORT
+
+# Probe the app's own /health endpoint with Python (no curl/wget is installed).
+HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
+    CMD ["python", "-c", "import os, urllib.request; urllib.request.urlopen('http://127.0.0.1:' + os.environ['PORT'] + '/health', timeout=3)"]
+
+# uvicorn is the ASGI server declared in pyproject.toml; sh -c is needed so
+# that ${PORT} is expanded at container start, and exec makes uvicorn PID 1.
+CMD ["sh", "-c", \
+    "exec uvicorn app:app \
+    --host 0.0.0.0 \
+    --port ${PORT} \
+    --workers 2 \
+    --proxy-headers \
+    --forwarded-allow-ips '*'"]
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..dd8102e
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 Bear Su
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the
following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..542c7d7 --- /dev/null +++ b/README.md @@ -0,0 +1,50 @@ +# hinpdof + +`hinpdof` is a FastAPI-based app that converts HTML content to PDF using WeasyPrint. + +## Features + +- Convert HTML content to PDF +- Customizable PDF filenames +- Health check endpoint + +## Requirements + +- Python 3.12+ +- FastAPI +- WeasyPrint +- Uvicorn + +## Installation + +1. Clone the repository: + + ```sh + git clone https://git.nice.net.nz/hinpdof.git + cd hinpdof + ``` + +2. Install dependencies using Poetry: + + ```sh + poetry install + ``` + +3. Run the application: + + ```sh + poetry run uvicorn app:app --reload + ``` + +## Usage + +### Convert HTML to PDF + +Send a POST request to `/pdf` with the following JSON body: + +```json +{ + "html": "

Hello, World!

",
+  "filename": "testfile"
+}
+```
diff --git a/app.py b/app.py
new file mode 100644
index 0000000..d4a426a
--- /dev/null
+++ b/app.py
@@ -0,0 +1,122 @@
+import io
+import logging
+import re
+
+from fastapi import FastAPI, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import JSONResponse, StreamingResponse
+from pydantic import BaseModel, Field
+from weasyprint import HTML
+
+# Initialize logging
+logging.basicConfig(level=logging.INFO)
+
+# Base name (without extension) used when the request gives no usable filename
+DEFAULT_FILENAME = "hinpdof"
+
+
+class PdfRequest(BaseModel):
+    """Request body for the /pdf endpoint."""
+
+    html: str = Field(..., min_length=1, description="HTML content to convert to PDF")
+    filename: str | None = None
+
+
+app = FastAPI()
+
+# NOTE(review): wildcard origins together with allow_credentials=True is very
+# permissive; confirm that credentialed cross-origin requests are really needed.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+
+# Compile the regular expression once
+FILENAME_SANITIZE_REGEX = re.compile(r"[^a-zA-Z0-9_\-]")
+
+
+def sanitize_filename(filename: str) -> str:
+    """
+    Sanitize the filename by replacing invalid characters with underscores.
+
+    Only ASCII letters, digits, underscores and hyphens are kept as-is.
+
+    Args:
+        filename (str): The original filename.
+
+    Returns:
+        str: The sanitized filename.
+    """
+    return FILENAME_SANITIZE_REGEX.sub("_", filename)
+
+
+async def pdf_generator(byte_string: bytes):
+    """
+    Generator to yield PDF content in chunks.
+
+    Args:
+        byte_string (bytes): The PDF content as bytes.
+
+    Yields:
+        bytes: Chunks of the PDF content, at most 4096 bytes each.
+    """
+    byte_stream = io.BytesIO(byte_string)
+    chunk = byte_stream.read(4096)
+    while chunk:
+        yield chunk
+        chunk = byte_stream.read(4096)
+
+
+@app.post("/pdf")
+def pdf(body: PdfRequest):
+    """
+    Endpoint to convert HTML content to a PDF file.
+
+    Declared as a plain function so that FastAPI runs the blocking WeasyPrint
+    rendering in its worker thread pool instead of on the event loop.
+
+    Args:
+        body (PdfRequest): Request body containing HTML content and an optional filename.
+
+    Returns:
+        StreamingResponse: A streaming response with the generated PDF file.
+
+    Raises:
+        HTTPException: With status 400 if WeasyPrint fails to render the HTML.
+    """
+    logging.info("Received request to generate PDF")
+    try:
+        # NOTE(review): WeasyPrint may fetch URLs referenced by the HTML; if the
+        # input is untrusted, consider restricting this with a custom url_fetcher.
+        byte_string = HTML(string=body.html).write_pdf()
+    except Exception as e:
+        logging.exception("Error generating PDF: %s", e)
+        raise HTTPException(status_code=400, detail="Invalid HTML input") from e
+
+    # Fall back to the default when the name is missing, empty or whitespace-only,
+    # so the download is never called just ".pdf".
+    requested_name = (body.filename or "").strip()
+    filename = sanitize_filename(requested_name or DEFAULT_FILENAME)
+
+    headers = {
+        "Content-Type": "application/pdf",
+        "Content-Disposition": f'attachment; filename="{filename}.pdf"',
+    }
+
+    logging.info("PDF generated successfully: %s.pdf", filename)
+    return StreamingResponse(pdf_generator(byte_string), headers=headers)
+
+
+@app.get("/health")
+async def health_check():
+    """
+    Endpoint to check the health status of the application.
+
+    Returns:
+        JSONResponse: A JSON response with the status of the application.
+    """
+    return JSONResponse(content={"status": "ok"})
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..3642aba
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,33 @@
+[tool.poetry]
+name = "hinpdof"
+version = "0.1.0"
+license = "MIT"
+description = "Uses FastAPI to expose a REST API which takes HTML as input and converts to PDF output using Weasyprint"
+authors = ["Hadley Rich "]
+readme = "README.md"
+package-mode = false
+
+[tool.poetry.dependencies]
+python = "^3.12"
+fastapi = "^0.115.4"
+weasyprint = "^63.0"
+uvicorn = "^0.32.0"
+
+[tool.poetry.group.dev.dependencies]
+pytest = "^8.3.3"
+httpx = "^0.27.2"
+pytest-cov = "^4.0.0"
+ruff = "^0.7.3"
+pre-commit = "^3.4.0"
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
+
+[tool.ruff]
+line-length = 88
+target-version = "py312"
+
+[tool.ruff.lint]
+select = ["E", "F", "W", "C", "N", "B"]
+ignore = ["E501"]
diff --git a/test_app.py b/test_app.py
new file mode 100644
index 0000000..806a559
--- /dev/null
+++ b/test_app.py
@@ -0,0 +1,82 @@
+import pytest
+from fastapi.testclient import TestClient
+
+from app import app
+
+
+@pytest.fixture
+def client():
+    return TestClient(app)
+
+
+def test_health_check(client): + response = client.get("/health") + assert response.status_code == 200 + assert response.json() == {"status": "ok"} + + +def test_pdf_generation(client): + request_data = {"html": "

Hello, World!

", "filename": "testfile"} + response = client.post("/pdf", json=request_data) + assert response.status_code == 200 + assert response.headers["Content-Type"] == "application/pdf" + assert ( + response.headers["Content-Disposition"] == 'attachment; filename="testfile.pdf"' + ) + + +def test_pdf_generation_default_filename(client): + request_data = {"html": "

Hello, World!

", "filename": None} + response = client.post("/pdf", json=request_data) + assert response.status_code == 200 + assert response.headers["Content-Type"] == "application/pdf" + assert ( + response.headers["Content-Disposition"] == 'attachment; filename="hinpdof.pdf"' + ) + + +def test_pdf_generation_invalid_html(client): + request_data = {"html": "", "filename": "testfile"} + response = client.post("/pdf", json=request_data) + assert response.status_code == 422 # Unprocessable Entity due to invalid input + + +def test_pdf_generation_missing_html(client): + request_data = {"filename": "testfile"} + response = client.post("/pdf", json=request_data) + assert ( + response.status_code == 422 + ) # Unprocessable Entity due to missing required field + + +def test_pdf_generation_large_html(client): + large_html = "

" + "Hello, World! " * 1000 + "

" + request_data = {"html": large_html, "filename": "largefile"} + response = client.post("/pdf", json=request_data) + assert response.status_code == 200 + assert response.headers["Content-Type"] == "application/pdf" + assert ( + response.headers["Content-Disposition"] + == 'attachment; filename="largefile.pdf"' + ) + + +def test_pdf_generation_invalid_filename(client): + request_data = {"html": "

Hello, World!

", "filename": "invalid/filename"} + response = client.post("/pdf", json=request_data) + assert response.status_code == 200 + assert response.headers["Content-Type"] == "application/pdf" + assert ( + response.headers["Content-Disposition"] + == 'attachment; filename="invalid_filename.pdf"' + ) + + +def test_pdf_generation_missing_filename(client): + request_data = {"html": "

Hello, World!

"} + response = client.post("/pdf", json=request_data) + assert response.status_code == 200 + assert response.headers["Content-Type"] == "application/pdf" + assert ( + response.headers["Content-Disposition"] == 'attachment; filename="hinpdof.pdf"' + )