diff --git a/.env.example b/.env.example
new file mode 100644
index 0000000..9a521a4
--- /dev/null
+++ b/.env.example
@@ -0,0 +1,9 @@
+# Azure Document Intelligence
+AZURE_DOC_ENDPOINT=
+AZURE_DOC_KEY=
+
+# Azure OpenAI
+AZURE_OPENAI_ENDPOINT=
+AZURE_OPENAI_KEY=
+AZURE_DEPLOYMENT=
+API_VERSION=
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..2eea525
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+.env
\ No newline at end of file
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..a83262e
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,164 @@
+# Makefile for Geotechnical Layer Extractor
+.PHONY: help setup single test-optimizer clean test info
+
+.DEFAULT_GOAL := help
+
+# Variables
+PYTHON := python3
+VENV := venv
+VENV_BIN := $(VENV)/bin
+PIP := $(VENV_BIN)/pip
+PYTHON_VENV := $(VENV_BIN)/python
+
+# Optimizer variables
+EXAMPLES_DIR ?= examples
+EXPECTED_DIR ?= output_man
+
+help:
+	@echo "đŸȘš Geotechnical Layer Extractor"
+	@echo "================================"
+	@echo ""
+	@echo "📩 Setup:"
+	@echo "  make setup              - Install all dependencies and create venv"
+	@echo ""
+	@echo "🚀 Running:"
+	@echo "  make single PDF=<file>  - Extract layers from a single PDF"
+	@echo "  make test-optimizer     - Evaluate the current prompt on test PDFs"
+	@echo ""
+	@echo "đŸ§č Maintenance:"
+	@echo "  make clean              - Clean all generated files"
+	@echo "  make test               - Test Azure connections"
+	@echo "  make info               - Show system information"
+	@echo ""
+	@echo "Examples:"
+	@echo "  make single PDF=document.pdf"
+	@echo "  make single PDF=examples/sample.pdf"
+	@echo "  make test-optimizer"
+
+# Create virtual environment
+venv:
+	@echo "🐍 Creating virtual environment..."
+	@$(PYTHON) -m venv $(VENV)
+	@echo "✅ Virtual environment created!"
+
+# Setup - Install all dependencies
+setup: venv
+	@echo "📩 Installing dependencies..."
+	@$(PIP) install --upgrade pip
+	@$(PIP) install -r requirements.txt
+	@mkdir -p output
+	@mkdir -p output_man
+	@mkdir -p examples
+	@echo "✅ Setup complete!"
+	@echo ""
+	@echo "⚠ Don't forget to:"
+	@echo "1. Copy .env.example to .env and add your Azure keys"
+	@echo "2. Add test PDFs in examples/ folder"
+	@echo "3. Add expected results in output_man/ folder"
+
+# Extract layers from a single PDF
+single:
+	@if [ -z "$(PDF)" ]; then \
+		echo "❌ Please provide a PDF file: make single PDF=your_file.pdf"; \
+		exit 1; \
+	fi
+	@if [ ! -f "$(PDF)" ]; then \
+		echo "❌ File not found: $(PDF)"; \
+		exit 1; \
+	fi
+	@if [ ! -f "extract_single.py" ]; then \
+		echo "❌ extract_single.py not found!"; \
+		exit 1; \
+	fi
+	@echo "🚀 Extracting layers from: $(PDF)"
+	@$(PYTHON_VENV) extract_single.py "$(PDF)" --pretty
+
+# Run prompt optimizer
+test-optimizer:
+	@echo "🔬 Running Prompt Optimizer"
+	@echo "=============================="
+	@echo "📁 Examples directory: $(EXAMPLES_DIR)"
+	@echo "📁 Expected results: $(EXPECTED_DIR)"
+	@echo ""
+	@if [ ! -f "prompt_optimizer.py" ]; then \
+		echo "❌ prompt_optimizer.py not found!"; \
+		exit 1; \
+	fi
+	@if [ ! -d "$(EXAMPLES_DIR)" ]; then \
+		echo "❌ Examples directory not found: $(EXAMPLES_DIR)"; \
+		echo "💡 Create it with: mkdir -p $(EXAMPLES_DIR)"; \
+		exit 1; \
+	fi
+	@if [ ! -d "$(EXPECTED_DIR)" ]; then \
+		echo "❌ Expected results directory not found: $(EXPECTED_DIR)"; \
+		echo "💡 Create it with: mkdir -p $(EXPECTED_DIR)"; \
+		exit 1; \
+	fi
+	@echo "🚀 Starting optimization..."
+	@$(PYTHON_VENV) prompt_optimizer.py \
+		--examples-dir $(EXAMPLES_DIR) \
+		--expected-dir $(EXPECTED_DIR)
+
+# Test Azure connections
+test:
+	@echo "đŸ§Ș Testing Azure connections..."
+	@$(PYTHON_VENV) -c "from azure.ai.formrecognizer import DocumentAnalysisClient; \
+	from azure.core.credentials import AzureKeyCredential; \
+	from openai import AzureOpenAI; \
+	print('✅ Azure imports successful')"
+	@echo "✅ All imports working!"
+
+# Clean all generated files
+clean:
+	@echo "đŸ§č Cleaning all files..."
+	@rm -rf $(VENV)
+	@rm -rf __pycache__ */__pycache__
+	@rm -rf .pytest_cache
+	@rm -rf *.pyc */*.pyc
+	@rm -rf output/*.json output/*.txt output/*.html
+	@rm -rf *.log
+	@rm -rf test_results.json
+	@find . -type f -name ".DS_Store" -delete 2>/dev/null || true
+	@echo "✅ Clean complete!"
+
+# Show system information
+info:
+	@echo "📊 System Information:"
+	@echo "======================"
+	@echo "OS: $$(uname -s)"
+	@echo "Python: $$($$(which $(PYTHON)) --version)"
+	@if [ -d "$(VENV)" ]; then \
+		echo "Virtual env: ✅ $(VENV) exists"; \
+		echo "Pip version: $$($(PIP) --version 2>/dev/null || echo 'N/A')"; \
+	else \
+		echo "Virtual env: ❌ Not created (run 'make setup')"; \
+	fi
+	@echo ""
+	@echo "📁 Project structure:"
+	@if [ -f "extract_single.py" ]; then \
+		echo "✅ extract_single.py found"; \
+	else \
+		echo "❌ extract_single.py missing"; \
+	fi
+	@if [ -f "prompt_optimizer.py" ]; then \
+		echo "✅ prompt_optimizer.py found"; \
+	else \
+		echo "❌ prompt_optimizer.py missing"; \
+	fi
+	@echo ""
+	@echo "📁 Directories:"
+	@if [ -d "output" ]; then \
+		echo "✅ output/ exists ($$(ls -1 output/ 2>/dev/null | wc -l | tr -d ' ') files)"; \
+	else \
+		echo "❌ output/ not found"; \
+	fi
+	@if [ -d "$(EXAMPLES_DIR)" ]; then \
+		echo "✅ $(EXAMPLES_DIR)/ exists ($$(ls -1 $(EXAMPLES_DIR)/*.pdf 2>/dev/null | wc -l | tr -d ' ') PDFs)"; \
+	else \
+		echo "❌ $(EXAMPLES_DIR)/ not found"; \
+	fi
+	@if [ -d "$(EXPECTED_DIR)" ]; then \
+		echo "✅ $(EXPECTED_DIR)/ exists ($$(ls -1 $(EXPECTED_DIR)/*_layers.json 2>/dev/null | wc -l | tr -d ' ') JSON files)"; \
+	else \
+		echo "❌ $(EXPECTED_DIR)/ not found"; \
+	fi
\ No newline at end of file
diff --git a/README.md b/README.md
index e69de29..e0ead6e 100644
--- a/README.md
+++ b/README.md
@@ -0,0 +1,330 @@
+# đŸȘš Geotechnical Layer Extractor
+
+A tool for automatically extracting geological layers from borehole logs in PDF format, using artificial intelligence to analyze and structure geotechnical data.
+
+## 🎯 Purpose
+
+This project automatically extracts geological layer information (depths and descriptions) from borehole log PDFs, which are often complex and poorly structured. It uses Azure Document Intelligence for OCR and GPT-4 for semantic analysis of the content.
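+
+Under the hood, the pipeline boils down to two calls: Document Intelligence turns the PDF into raw text, then a GPT deployment turns that text into structured layers. A minimal sketch of those two stages (simplified; the real prompts, page chunking, validation, and deduplication live in `extract_single.py`):
+
+```python
+import os
+from azure.ai.formrecognizer import DocumentAnalysisClient
+from azure.core.credentials import AzureKeyCredential
+from openai import AzureOpenAI
+
+# Stage 1 (OCR): Document Intelligence reads the borehole log
+doc_client = DocumentAnalysisClient(
+    endpoint=os.environ["AZURE_DOC_ENDPOINT"],
+    credential=AzureKeyCredential(os.environ["AZURE_DOC_KEY"]),
+)
+with open("document.pdf", "rb") as f:
+    result = doc_client.begin_analyze_document(model_id="prebuilt-document", document=f).result()
+text = "\n".join(line.content for page in result.pages for line in (page.lines or []))
+
+# Stage 2 (semantic analysis): GPT returns the layers as structured JSON
+llm = AzureOpenAI(
+    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
+    api_key=os.environ["AZURE_OPENAI_KEY"],
+    api_version=os.environ["API_VERSION"],
+)
+response = llm.chat.completions.create(
+    model=os.environ["AZURE_DEPLOYMENT"],
+    messages=[{"role": "user", "content": f"Extrait les couches gĂ©ologiques en JSON:\n{text}"}],
+    response_format={"type": "json_object"},
+)
+print(response.choices[0].message.content)
+```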
+
+## ✹ Features
+
+- **Advanced OCR extraction**: uses Azure Document Intelligence to extract text and tables
+- **AI-powered analysis**: GPT-4 analyzes the content to identify geological layers
+- **Multi-page handling**: correctly processes documents spanning several pages
+- **Table detection**: identifies and parses complex tables
+- **Smart deduplication**: avoids duplicates across pages
+- **Prompt testing**: tooling to evaluate prompts against expected results
+- **Structured JSON export**: standardized, machine-readable output format
+
+## 📋 Prerequisites
+
+- Python 3.8 or later
+- An Azure account with access to:
+  - Azure Document Intelligence (Form Recognizer)
+  - Azure OpenAI with GPT-4 access
+
+## 🚀 Installation
+
+### 1. Clone the repository
+
+```bash
+git clone https://git.linaster.online/vincent.morisseau/geotech.git
+cd geotech
+```
+
+### 2. Install the dependencies
+
+```bash
+make setup
+```
+
+This will:
+- Create a Python virtual environment
+- Install all required packages
+- Create the required folders
+
+### 3. Configure the environment variables
+
+⚠ **IMPORTANT**: API keys must never be committed to Git!
+
+1. **Copy the example file**:
+```bash
+cp .env.example .env
+```
+
+2. **Edit the `.env` file** and fill in your Azure API keys:
+```bash
+nano .env  # or use your preferred editor
+```
+
+3. **Fill in the following variables** in the `.env` file:
+```env
+# Azure Document Intelligence
+AZURE_DOC_ENDPOINT=https://your-resource.cognitiveservices.azure.com/
+AZURE_DOC_KEY=your-document-intelligence-key-here
+
+# Azure OpenAI
+AZURE_OPENAI_ENDPOINT=https://your-resource.openai.azure.com/
+AZURE_OPENAI_KEY=your-openai-key-here
+AZURE_DEPLOYMENT=gpt-4o-mini  # or gpt-4o, gpt-4, etc.
+API_VERSION=2024-12-01-preview
+```
+
+⚠ **Security note**: the `.env` file is automatically ignored by Git (via `.gitignore`) to protect your API keys.
+
+### 4. Get your Azure keys
+
+#### Azure Document Intelligence
+
+1. **Create a resource in the [Azure portal](https://portal.azure.com)**
+   - Home → Create a resource
+   - AI + Machine Learning → Document Intelligence (Form Recognizer)
+   - Choose a region (e.g., East US 2)
+   - Pricing tier: S0 (Standard)
+
+2. **Retrieve the credentials**
+   - Go to your resource → Keys and Endpoint
+   - Copy `KEY 1` and `Endpoint`
+   - Paste these values into your `.env` file
+
+#### Azure OpenAI
+
+1. **Request access to Azure OpenAI** (if needed)
+   - [Access request form](https://azure.microsoft.com/en-us/products/ai-services/openai-service)
+   - Turnaround: usually 24-48h
+
+2. **Create an Azure OpenAI resource**
+   - Home → Create a resource → Azure OpenAI
+   - Choose a region (recommended: Sweden Central or East US 2)
+
+3. **Deploy a GPT model**
+   - Open Azure OpenAI Studio from your resource
+   - Deployments → Create new deployment
+   - Choose your model:
+     - `gpt-4o-mini`: economical, $0.003/page
+     - `gpt-4o`: best quality/price ratio, $0.042/page
+     - `gpt-4`: more expensive but more powerful, $0.21/page
+
+4. **Retrieve the credentials**
+   - In Azure OpenAI Studio → Deployments → View code
+   - Copy the endpoint, the key, and the deployment name
+   - Paste these values into your `.env` file
+
+### 5.
Verify the configuration
+
+```bash
+# Test the connection to the Azure services
+make test
+
+# Or check manually
+python -c "from dotenv import load_dotenv; import os; load_dotenv(); print('✅ OK' if os.getenv('AZURE_DOC_KEY') else '❌ .env not configured')"
+```
+
+## 📖 Usage
+
+### Extracting a single PDF
+
+```bash
+# Simple extraction (saved to output/document.json)
+make single PDF=document.pdf
+
+# Or directly with the script
+python extract_single.py document.pdf
+
+# Specify a custom output file
+python extract_single.py document.pdf -o results/custom.json
+
+# With human-readable JSON formatting
+python extract_single.py document.pdf --pretty
+```
+
+### Prompt testing
+
+To evaluate extraction quality on a set of test PDFs:
+
+1. Put the test PDFs in `examples/`
+2. Put the expected results in `output_man/` (format: `1_layers.json` for `1.pdf`)
+3. Run the test:
+
+```bash
+make test-optimizer
+```
+
+### Testing the Azure connections
+
+```bash
+make test
+```
+
+## 📁 Project structure
+
+```
+geotechnical-extractor/
+├── .env.example           # Example file for the environment variables
+├── .env                   # YOUR API keys (ignored by Git)
+├── .gitignore             # Files to ignore (includes .env)
+├── extract_single.py      # Main extraction script
+├── prompt_optimizer.py    # Prompt tester
+├── requirements.txt       # Python dependencies
+├── Makefile               # Convenience commands
+├── README.md              # This file
+├── examples/              # Test PDFs for the prompt tester
+│   ├── 1.pdf
+│   ├── 2.pdf
+│   └── ...
+├── output_man/            # Expected results (ground truth)
+│   ├── 1_layers.json
+│   ├── 2_layers.json
+│   └── ...
+└── output/                # Generated results
+    ├── document.json
+    └── ...
+```
+
+## 📊 Output format
+
+Results are saved as JSON with the following structure:
+
+```json
+{
+  "metadata": {
+    "source_file": "/path/to/document.pdf",
+    "extraction_date": "2025-08-06T10:30:45.123456",
+    "extraction_method": "Azure Document Intelligence + GPT-4",
+    "pages": 6,
+    "layer_count": 12,
+    "extraction_time_seconds": 23.5
+  },
+  "layers": [
+    {
+      "page": 1,
+      "depth_start": 0.0,
+      "depth_end": 0.3,
+      "description": "Terre vĂ©gĂ©tale, brun foncĂ©, prĂ©sence de racines"
+    },
+    {
+      "page": 1,
+      "depth_start": 0.3,
+      "depth_end": 2.5,
+      "description": "Argile limoneuse, gris-beige, plastique, humide"
+    }
+  ]
+}
+```
+
+### Field descriptions
+
+- **metadata**: information about the extraction
+  - `source_file`: absolute path of the source PDF
+  - `extraction_date`: date and time of the extraction
+  - `pages`: number of pages processed
+  - `layer_count`: number of layers extracted
+  - `extraction_time_seconds`: extraction duration
+
+- **layers**: list of geological layers
+  - `page`: page number where the layer appears
+  - `depth_start`: start depth in meters
+  - `depth_end`: end depth in meters
+  - `description`: full description of the material
+
+## 💰 Estimated costs
+
+Between ~$0.003 and ~$0.21 per page depending on the model.
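+
+For example, a 100-page survey comes out to roughly 100 × $0.042 ≈ $4.20 with gpt-4o, and about 100 × $0.003 ≈ $0.30 with gpt-4o-mini.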
+
+**Recommendation**: use GPT-4o for the best quality/price ratio
+
+## 🚹 Troubleshooting
+
+### "Missing Azure credentials"
+- Check that the `.env` file exists
+- Check that all variables in `.env` are filled in
+- Make sure you copied `.env.example` to `.env`
+
+### "Resource not found"
+- Check that the Azure endpoint is correct (with or without a trailing `/`)
+- Make sure the resource is deployed and active
+
+### "Invalid API key"
+- Check that you are using the right key
+- The Document Intelligence and OpenAI keys are different
+- Check that there are no spaces in the keys
+
+### "Model not found"
+- Check the exact name of the GPT deployment in `.env`
+- Make sure the model is deployed in your region
+- The name is case-sensitive
+
+### Incomplete extraction
+- Increase the character limit in the script
+- Check the quality of the PDF (legible scan)
+- Use GPT-4o instead of GPT-4o-mini
+
+### Missing pages
+- The script now processes up to 8,000 characters per page
+- Tables are tagged with `[TABLE DATA]` for better detection
+
+## đŸ› ïž Customization
+
+### Modifying the prompts
+
+The prompts are defined in the `get_prompts()` method of the scripts. You can adapt them to your specific needs.
+
+### Adjusting the limits
+
+```python
+# Increase the character limit per page
+max_chars = 10000  # instead of 8000
+
+# Increase the response token budget
+max_tokens=6000  # instead of 4000
+```
+
+### Additional environment variables
+
+You can add the following to your `.env` (note: the scripts currently hard-code these values, so they must be adapted to read them):
+
+```env
+# Optional: log level
+LOG_LEVEL=DEBUG  # or INFO, WARNING, ERROR
+
+# Optional: chunk size for large PDFs
+CHUNK_SIZE=2  # number of pages per chunk
+```
+
+### Supported languages
+
+The current prompts are in French but can be translated. The system supports any document that Document Intelligence can process.
+
+## đŸ§č Maintenance
+
+```bash
+# Clean all generated files
+make clean
+
+# Show system information
+make info
+```
+
+## 📈 Performance
+
+- **Average time**: 3-5 seconds per page
+- **Accuracy**: ~85-95% depending on document complexity
+- **Limit**: PDFs up to 500 pages (Azure limit)
+
+## 🔒 Security
+
+- **Never commit the `.env` file** containing your API keys
+- The `.env` file is automatically ignored by Git
+- Use `.env.example` as a template to share the structure
+- Rotate your Azure API keys regularly
+- Use environment variables in production
+
+## 📝 License
+
+This project is licensed under VCGI by Tom BLANCHETON.
+
+## 🙏 Acknowledgments
+
+- Azure Document Intelligence for the high-quality OCR
+- OpenAI for GPT-4
+- The geotechnical community for feedback and testing
diff --git a/examples/1.pdf b/examples/1.pdf
new file mode 100644
index 0000000..648df4a
Binary files /dev/null and b/examples/1.pdf differ
diff --git a/examples/2.pdf b/examples/2.pdf
new file mode 100644
index 0000000..526d40a
Binary files /dev/null and b/examples/2.pdf differ
diff --git a/examples/3.pdf b/examples/3.pdf
new file mode 100644
index 0000000..04a40ee
Binary files /dev/null and b/examples/3.pdf differ
diff --git a/examples/4.pdf b/examples/4.pdf
new file mode 100644
index 0000000..1484450
Binary files /dev/null and b/examples/4.pdf differ
diff --git a/examples/5.pdf b/examples/5.pdf
new file mode 100644
index 0000000..0bd4081
Binary files /dev/null and b/examples/5.pdf differ
diff --git a/examples/6.pdf b/examples/6.pdf
new file mode 100644
index 0000000..4ea0e16
Binary files /dev/null and b/examples/6.pdf differ
diff --git a/examples/7.pdf b/examples/7.pdf
new file mode 100644
index 0000000..76a5e0e
Binary files /dev/null and b/examples/7.pdf differ
diff --git a/examples/8.pdf b/examples/8.pdf
new file mode 100644
index 0000000..5826f96
Binary files /dev/null and b/examples/8.pdf differ
diff --git a/extract_single.py b/extract_single.py
new file mode 100644
index 0000000..2b16f8c
--- /dev/null
+++ b/extract_single.py
@@ -0,0 +1,429 @@
+#!/usr/bin/env python3
+"""
+Single PDF Layer Extractor - Extract geological layers from a single PDF
+"""
+import click
+import json
+import logging
+import PyPDF2
+import os
+
+from dotenv import load_dotenv
+from pathlib import Path
+from typing import List, Dict, Tuple
+from datetime import datetime
+from azure.ai.formrecognizer import DocumentAnalysisClient
+from azure.core.credentials import AzureKeyCredential
+from openai import AzureOpenAI
+
+load_dotenv()
+
+# Azure Document Intelligence configuration
+AZURE_DOC_ENDPOINT = os.getenv("AZURE_DOC_ENDPOINT")
+AZURE_DOC_KEY = os.getenv("AZURE_DOC_KEY")
+
+# Azure OpenAI configuration
+AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
+AZURE_OPENAI_KEY = os.getenv("AZURE_OPENAI_KEY")
+AZURE_DEPLOYMENT = os.getenv("AZURE_DEPLOYMENT", "gpt-4.1")
+API_VERSION = os.getenv("API_VERSION", "2024-12-01-preview")
+
+if not all([AZURE_DOC_ENDPOINT, AZURE_DOC_KEY, AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_KEY]):
+    raise ValueError(
+        "❌ Missing Azure credentials! Please check your .env file.\n"
+        "Required variables: AZURE_DOC_ENDPOINT, AZURE_DOC_KEY, AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_KEY"
+    )
+
+# Logging configuration
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+logger.info("✅ Environment variables loaded successfully")
+logger.info(f"📍 Document Intelligence endpoint: {AZURE_DOC_ENDPOINT}")
+logger.info(f"đŸ€– OpenAI deployment: {AZURE_DEPLOYMENT}")
+
+class SinglePDFExtractor:
+    """Extract geological layers from a single PDF"""
+
+    def __init__(self):
+        """Initialize Azure clients"""
+        self.doc_client = DocumentAnalysisClient(
+            endpoint=AZURE_DOC_ENDPOINT,
+            credential=AzureKeyCredential(AZURE_DOC_KEY)
+        )
+        self.openai_client = AzureOpenAI(
+            azure_endpoint=AZURE_OPENAI_ENDPOINT,
+            api_key=AZURE_OPENAI_KEY,
+            api_version=API_VERSION
+        )
+
+    def get_prompts(self) -> Tuple[str, str]:
+        """Get the system and user prompts"""
+        system_prompt = """Tu es un expert gĂ©otechnicien spĂ©cialisĂ© dans l'analyse de logs de forage.
+Ta tĂąche est d'extraire UNIQUEMENT les couches gĂ©ologiques avec leurs profondeurs et descriptions COMPLÈTES. + +RÈGLES CRITIQUES: +1. Chaque couche DOIT avoir une profondeur de dĂ©but et de fin en mĂštres +2. La description doit contenir TOUT le texte associĂ© Ă  la couche (mĂȘme sur plusieurs lignes) +3. Notation dĂ©cimale: 0,3 ou 0.3 = 0.3m (PAS 3m!) +4. Profondeurs typiques: 0-0.3m, 0.3-1m, 1-4m, 4-10m, etc. +5. NE PAS sĂ©parer ou classifier le matĂ©riau - tout dans la description +6. IGNORER les mots isolĂ©s qui ne sont pas dans la colonne description + +RÈGLE DE CONTINUITÉ DES COUCHES: +⚠ Une couche gĂ©ologique est CONTINUE entre deux profondeurs +⚠ Si tu vois une profondeur isolĂ©e suivie de description, cherche OBLIGATOIREMENT la profondeur de fin +⚠ Les profondeurs de fin peuvent ĂȘtre: + - Sur la mĂȘme ligne aprĂšs la description + - Sur une ligne suivante + - Au dĂ©but de la couche suivante (la fin d'une couche = dĂ©but de la suivante) +⚠ NE JAMAIS fragmenter une description continue en plusieurs couches + +GESTION DES PAGES: +⚠ Les couches peuvent continuer d'une page Ă  l'autre +⚠ Sur une nouvelle page, si la premiĂšre profondeur n'est pas 0.0m, c'est la continuation de la page prĂ©cĂ©dente +⚠ Ne pas dupliquer les couches identiques sur plusieurs pages + +SORTIE: JSON valide uniquement""" + + user_prompt_template = """Extrait toutes les couches gĂ©ologiques de cette page de log de forage. + +CE QUI EST UNE COUCHE GÉOLOGIQUE: +✓ Descriptions de matĂ©riaux: sol, roche, sable, argile, limon, remblai, terre vĂ©gĂ©tale, etc. +✓ A une plage de profondeur ET des propriĂ©tĂ©s de matĂ©riau +✓ DĂ©crit les caractĂ©ristiques physiques: couleur, consistance, humiditĂ©, altĂ©ration, granulomĂ©trie + +CE QUI N'EST PAS UNE COUCHE (IGNORER): +✗ Descriptions d'images ou lĂ©gendes +✗ MĂ©tadonnĂ©es du projet (coordonnĂ©es, dates, noms d'entreprise) +✗ Descriptions d'Ă©quipement ou de mĂ©thodologie +✗ DonnĂ©es administratives ou commentaires sur la procĂ©dure +✗ Mots isolĂ©s dans d'autres colonnes (classifications, formations gĂ©ologiques, etc.) + +EXCLUSIONS ABSOLUES - NE JAMAIS extraire: +✗ Annotations techniques isolĂ©es: CZ, JT, JTx5, etc. +✗ Mesures techniques sans description de matĂ©riau +✗ Échantillons ("Undisturbed sample", "Disturbed sample", "sample", "Ă©chantillon") +✗ Lignes contenant uniquement des angles, pressions ou mesures +✗ RĂ©fĂ©rences Ă  des essais ou tests sans description de sol + +ALGORITHME DE DÉTECTION DES PROFONDEURS: +1. Pour chaque nombre qui ressemble Ă  une profondeur (X.X m ou X,X m): + - Si suivi de "-" ou "Ă " ou "bis" → profondeur de dĂ©but, chercher la fin + - Si prĂ©cĂ©dĂ© de "-" ou "Ă " ou "bis" → profondeur de fin + - Si isolĂ© → vĂ©rifier le contexte: + * DĂ©but de description → profondeur de dĂ©but + * Fin de description → profondeur de fin + * Nouvelle ligne → dĂ©but de nouvelle couche +2. TOUJOURS vĂ©rifier la cohĂ©rence: fin couche N = dĂ©but couche N+1 +3. Si une couche n'a pas de profondeur de fin explicite, elle va jusqu'au dĂ©but de la couche suivante + +MOTIFS À RECHERCHER: +- "0.00-0.30m: Terre vĂ©gĂ©tale, brun foncĂ©..." +- "0,3 Ă  1,0m Sable fin beige..." +- "de 1m Ă  4m: Argile plastique grise..." +- "1.00 - 4.00 ARGILE limoneuse..." +- "0.3 m Description... 
4.0 m" (profondeurs sĂ©parĂ©es) +- Tableaux avec colonnes profondeur/description + +IDENTIFICATION DES COLONNES DE TABLEAU: +⚠ Dans un tableau, identifier VISUELLEMENT les colonnes avant d'extraire +⚠ Colonnes Ă  IGNORER complĂštement: + - Formation gĂ©ologique (Quaternaire, Tertiaire, Quaternaire, etc.) + - Stratigraphie + - Classification (Colluvions, Alluvions, etc.) + - Symboles ou codes isolĂ©s + - Âge gĂ©ologique +⚠ Colonnes Ă  UTILISER: + - Description du matĂ©riau + - PropriĂ©tĂ©s physiques (couleur, texture, consistance) +⚠ Si doute sur une colonne, vĂ©rifier si elle contient des propriĂ©tĂ©s physiques dĂ©taillĂ©es + +RÈGLE CRITIQUE DE SÉPARATION: +⚠ Les profondeurs (0-0.3m, 0.30m, etc.) ne font JAMAIS partie de la description +⚠ Si une ligne contient "0,30 m - Mutterboden, braun", alors: + - depth_start: 0.0, depth_end: 0.3 + - description: "Mutterboden, braun" (SANS les profondeurs) +⚠ Retirer TOUS les prĂ©fixes de profondeur du champ description +⚠ NE PAS inclure les profondeurs rĂ©pĂ©tĂ©es Ă  la fin de la description + +IMPORTANT: +- Capture TOUT le texte de description, mĂȘme s'il est long ou sur plusieurs lignes +- Ne pas rĂ©sumer ou reformuler - garder le texte original +- Inclure TOUTES les propriĂ©tĂ©s mentionnĂ©es (couleur, texture, humiditĂ©, inclusions, etc.) +- EXCLURE ABSOLUMENT les profondeurs du champ description +- Si le texte commence par "0,30 m - Mutterboden", la description est SEULEMENT "Mutterboden" +- SĂ©parer clairement: profondeurs dans depth_start/depth_end, matĂ©riau et propriĂ©tĂ©s dans description +- NE PAS mĂ©langer les colonnes d'un tableau - prendre uniquement la colonne description principale + +VALIDATION OBLIGATOIRE avant de retourner le JSON: +□ Toutes les couches ont une profondeur de dĂ©but ET de fin +□ Les profondeurs sont continues (pas de trous entre les couches) +□ Les profondeurs sont cohĂ©rentes (dĂ©but < fin) +□ Les descriptions ne contiennent PAS les profondeurs +□ Les descriptions ne contiennent PAS de noms de colonnes de classification +□ Aucune annotation technique isolĂ©e n'est prĂ©sente +□ Pas de duplication de couches identiques + +Retourne ce JSON: +{{ + "layers": [ + {{ + "depth_start": 0.0, + "depth_end": 0.3, + "description": "Texte de description SANS les profondeurs - uniquement matĂ©riau et propriĂ©tĂ©s" + }} + ] +}} + +EXEMPLE CONCRET: +Texte original: "0,00 - 0,30 m Mutterboden, dunkelbraun, humos" +RĂ©sultat attendu: +{{ + "depth_start": 0.0, + "depth_end": 0.3, + "description": "Mutterboden, dunkelbraun, humos" +}} +PAS: "description": "0,00 - 0,30 m Mutterboden, dunkelbraun, humos" ❌ + +TEXTE DE LA PAGE {page_num}: +{text}""" + + return system_prompt, user_prompt_template + + def extract_text_from_pdf(self, pdf_path: Path) -> List[str]: + """Extract text from PDF with chunking""" + logger.info(f"🔍 Extracting text from {pdf_path.name}") + + total_pages = self.get_page_count(pdf_path) + logger.info(f"📄 PDF has {total_pages} pages") + + if total_pages == 0: + logger.error("Could not determine page count") + return [] + + # Extraire par chunks de 2 pages + pages_text = [] + chunk_size = 2 + for start_page in range(1, total_pages + 1, chunk_size): + end_page = min(start_page + chunk_size - 1, total_pages) + with open(pdf_path, "rb") as f: + poller = self.doc_client.begin_analyze_document( + model_id="prebuilt-document", + document=f, + pages=f"{start_page}-{end_page}" + ) + result = poller.result() + for page in result.pages: + page_lines = [line.content for line in (page.lines or [])] + 
pages_text.append('\n'.join(page_lines)) + + return pages_text + + def get_page_count(self, pdf_path: Path) -> int: + """Get the number of pages in a PDF""" + try: + with open(pdf_path, "rb") as f: + pdf_reader = PyPDF2.PdfReader(f) + return len(pdf_reader.pages) + except: + pass + return 0 + + def _extract_table_text(self, table) -> str: + """Extract text from table with better structure preservation""" + if not hasattr(table, 'cells') or not table.cells: + return "" + + max_row = max(cell.row_index for cell in table.cells) + max_col = max(cell.column_index for cell in table.cells) + table_array = [["" for _ in range(max_col + 1)] for _ in range(max_row + 1)] + + for cell in table.cells: + if hasattr(cell, 'content'): + table_array[cell.row_index][cell.column_index] = cell.content + + lines = [] + for row_idx, row in enumerate(table_array): + row_text = " | ".join(str(cell).strip() for cell in row) + if row_text.strip(): + lines.append(row_text) + # Add separator after header row + if row_idx == 0 and len(lines) == 1: + lines.append("-" * len(row_text)) + + return "\n".join(lines) + + def extract_layers_from_page(self, page_text: str, page_num: int, system_prompt: str, user_prompt_template: str) -> List[Dict]: + """Extract layers from a single page using GPT""" + if not page_text.strip(): + logger.warning(f"⚠ Page {page_num} is empty") + return [] + + max_chars = 8000 # todo + user_prompt = user_prompt_template.format( + page_num=page_num, + text=page_text[:max_chars] + ) + + try: + logger.info(f"đŸ€– Processing page {page_num} with GPT-4...") + + response = self.openai_client.chat.completions.create( + model=AZURE_DEPLOYMENT, + messages=[ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_prompt} + ], + temperature=0.1, + max_tokens=4000, # todo + response_format={"type": "json_object"} + ) + + result = json.loads(response.choices[0].message.content) + valid_layers = [] + + for layer in result.get("layers", []): + try: + depth_start = float(str(layer.get("depth_start", 0)).replace(",", ".")) + depth_end = float(str(layer.get("depth_end", 0)).replace(",", ".")) + description = layer.get("description", "").strip() + + if (depth_start < depth_end and depth_end <= 100 and depth_start >= 0 and description): + valid_layers.append({ + "page": page_num, + "depth_start": depth_start, + "depth_end": depth_end, + "description": description + }) + except Exception as e: + logger.warning(f" Invalid layer data: {e}") + continue + logger.info(f" ✓ Found {len(valid_layers)} valid layers on page {page_num}") + return valid_layers + + except Exception as e: + logger.error(f"❌ GPT extraction failed for page {page_num}: {e}") + return [] + + def merge_duplicate_layers(self, layers: List[Dict]) -> List[Dict]: + """Merge duplicate layers across pages""" + if not layers: + return [] + + layers.sort(key=lambda x: (x['depth_start'], x['page'])) + + merged = [] + current = None + + for layer in layers: + if current is None: + current = layer.copy() + else: + if (abs(current['depth_start'] - layer['depth_start']) < 0.01 and + abs(current['depth_end'] - layer['depth_end']) < 0.01): + if len(layer['description']) > len(current['description']): + current['description'] = layer['description'] + current['page'] = min(current['page'], layer['page']) + else: + merged.append(current) + current = layer.copy() + + if current: + merged.append(current) + + return merged + + def extract_layers_from_pdf(self, pdf_path: Path) -> Dict: + """Main extraction function""" + logger.info(f"\n🚀 Starting 
extraction for: {pdf_path.name}")
+        start_time = datetime.now()
+
+        pages_text = self.extract_text_from_pdf(pdf_path)
+
+        if not pages_text:
+            logger.error("❌ No text extracted from PDF")
+            return {
+                "metadata": {
+                    "source_file": str(pdf_path.resolve()),
+                    "extraction_date": datetime.now().isoformat(),
+                    "extraction_method": "Azure Document Intelligence + GPT-4",
+                    "pages": 0,
+                    "layer_count": 0,
+                    "error": "No text extracted from PDF"
+                },
+                "layers": []
+            }
+
+        system_prompt, user_prompt_template = self.get_prompts()
+
+        all_layers = []
+        for page_idx, page_text in enumerate(pages_text):
+            page_layers = self.extract_layers_from_page(
+                page_text, page_idx + 1, system_prompt, user_prompt_template
+            )
+            all_layers.extend(page_layers)
+
+        logger.info(f"\n🔄 Merging duplicate layers...")
+        merged_layers = self.merge_duplicate_layers(all_layers)
+
+        extraction_time = (datetime.now() - start_time).total_seconds()
+
+        logger.info(f"\n✅ Extraction completed!")
+        logger.info(f" Total layers found: {len(merged_layers)}")
+        logger.info(f" Extraction time: {extraction_time:.1f} seconds")
+
+        return {
+            "metadata": {
+                "source_file": str(pdf_path.resolve()),
+                "extraction_date": datetime.now().isoformat(),
+                "extraction_method": "Azure Document Intelligence + GPT-4",
+                "pages": len(pages_text),
+                "layer_count": len(merged_layers),
+                "extraction_time_seconds": round(extraction_time, 1)
+            },
+            "layers": merged_layers
+        }
+
+@click.command()
+@click.argument('pdf_path', type=click.Path(exists=True, path_type=Path))
+@click.option('--output', '-o', type=click.Path(path_type=Path), help='Output JSON file path (default: same name as PDF)')
+@click.option('--pretty', is_flag=True, help='Pretty print the JSON output')
+def main(pdf_path: Path, output: Path, pretty: bool):
+    """Extract geological layers from a single PDF file."""
+    click.echo("đŸȘš Geological Layer Extractor")
+    click.echo("=" * 40)
+
+    extractor = SinglePDFExtractor()
+    result = extractor.extract_layers_from_pdf(pdf_path)
+
+    if output is None:
+        output_dir = Path('output')
+        output_dir.mkdir(exist_ok=True)
+        output = output_dir / pdf_path.with_suffix('.json').name
+    # Make sure the target directory exists, including for custom -o paths
+    output.parent.mkdir(parents=True, exist_ok=True)
+    with open(output, 'w', encoding='utf-8') as f:
+        if pretty:
+            json.dump(result, f, indent=2, ensure_ascii=False)
+        else:
+            json.dump(result, f, ensure_ascii=False)
+
+    click.echo(f"\nđŸ’Ÿ Results saved to: {output}")
+    click.echo("\n📊 Extraction Summary:")
+    click.echo(f" Source: {pdf_path.name}")
+    click.echo(f" Pages processed: {result['metadata']['pages']}")
+    click.echo(f" Layers extracted: {result['metadata']['layer_count']}")
+    click.echo(f" Time taken: {result['metadata'].get('extraction_time_seconds', 'n/a')}s")
+
+    if result['layers']:
+        click.echo("\n📋 Preview of extracted layers:")
+        for i, layer in enumerate(result['layers'][:3], 1):
+            click.echo(f"\n {i}. {layer['depth_start']:.1f} - {layer['depth_end']:.1f}m")
+            click.echo(f" {layer['description'][:80]}{'...' if len(layer['description']) > 80 else ''}")
+        if len(result['layers']) > 3:
+            click.echo(f"\n ...
and {len(result['layers']) - 3} more layers") + +if __name__ == '__main__': + main() diff --git a/output_man/1_layers.json b/output_man/1_layers.json new file mode 100644 index 0000000..4b93c48 --- /dev/null +++ b/output_man/1_layers.json @@ -0,0 +1,72 @@ +{ + "metadata": { + "source_file": "/Users/tom/Desktop/project/vinci/geotech/geotech-azure/examples/sample_pdfs/1.pdf", + "extraction_date": "2025-07-31T13:06:45.311638", + "extraction_method": "Azure Document Intelligence + GPT-3.5", + "pages": 2, + "layer_count": 8, + "extraction_time_seconds": 14.9 + }, + "layers": [ + { + "page": 1, + "depth_start": 0.0, + "depth_end": 0.2, + "description": "QuartĂ€r Schluff, sehr schwach tonig, schwach feinsandig, schwach organisch, braun, steif" + }, + { + "page": 1, + "depth_start": 0.2, + "depth_end": 0.8, + "description": "QuartĂ€r Schluff, schwach feinsandig, schwach tonig, hellbraun, dunkelbraun, steif" + }, + { + "page": 1, + "depth_start": 0.8, + "depth_end": 3.2, + "description": "QuartĂ€r Sand, stark kiesig, schwach steinig, schwach schluffig, grau" + }, + { + "page": 1, + "depth_start": 3.2, + "depth_end": 6.4, + "description": "Sand, schluffig, schwach feinkiesig, hellbraun, ockerbraun, graubraun, Metamagmatite" + }, + { + "page": 1, + "depth_start": 6.4, + "depth_end": 10.0, + "description": "Plutonit, hellgrau, rot, Metamagmatite, Granit-Zersatz," + }, + { + "page": 2, + "depth_start": 0.0, + "depth_end": 0.2, + "description": "QuartĂ€r Schluff, sehr schwach tonig, schwach feinsandig, schwach organisch, braun, steif" + }, + { + "page": 2, + "depth_start": 0.2, + "depth_end": 0.8, + "description": "QuartĂ€r Schluff, schwach feinsandig, schwach tonig, hellbraun, dunkelbraun, steif" + }, + { + "page": 2, + "depth_start": 0.8, + "depth_end": 3.2, + "description": "QuartĂ€r Sand, stark kiesig, schwach steinig, schwach schluffig, grau" + }, + { + "page": 2, + "depth_start": 3.2, + "depth_end": 6.4, + "description": "Sand, schluffig, schwach feinkiesig, hellbraun, ockerbraun, graubraun, Metamagmatite" + }, + { + "page": 2, + "depth_start": 6.4, + "depth_end": 10.0, + "description": "Plutonit, hellgrau, rot, Metamagmatite, Granit-Zersatz," + } + ] +} \ No newline at end of file diff --git a/output_man/2_layers.json b/output_man/2_layers.json new file mode 100644 index 0000000..33b1584 --- /dev/null +++ b/output_man/2_layers.json @@ -0,0 +1,30 @@ +{ + "metadata": { + "source_file": "/Users/tom/Desktop/project/vinci/geotech/geotech-azure/examples/sample_pdfs/2.pdf", + "extraction_date": "2025-07-31T13:03:32.291628", + "extraction_method": "Azure Document Intelligence + GPT-3.5", + "pages": 1, + "layer_count": 3, + "extraction_time_seconds": 8.67 + }, + "layers": [ + { + "page": 1, + "depth_start": 0.0, + "depth_end": 0.3, + "description": "Mutterboden, Sand, schluffig, schwach tonig, schwach kiesig, schwach humos, erdfeucht, dunkelbraun, kalkfrei, g = kantig" + }, + { + "page": 1, + "depth_start": 0.3, + "depth_end": 4.0, + "description": "Sand, schluffig bis stark schluffig, schwach kiesig bis kiesig, halbfest, erdfeucht bis feucht, braun, graubraun, hellbraun, kalkfrei, vollstĂ€ndig verwittert bis zersetzt, Gneis-Zersatz, mĂŒrbe, g = kantig" + }, + { + "page": 1, + "depth_start": 4.0, + "depth_end": 10.0, + "description": "massige Metamorphite, erdfeucht, graubraun, rotfahl, kalkfrei, mĂ€ĂŸig verwittert bis stark verwittert, R3-R4, Gneis, geklĂŒftet bis stak geklĂŒftet, z.T. 
zerbohrt" + } + ] +} \ No newline at end of file diff --git a/output_man/3_layers.json b/output_man/3_layers.json new file mode 100644 index 0000000..9d783aa --- /dev/null +++ b/output_man/3_layers.json @@ -0,0 +1,30 @@ +{ + "metadata": { + "source_file": "/Users/tom/Desktop/project/vinci/geotech/geotech-azure/examples/sample_pdfs/3.pdf", + "extraction_date": "2025-07-31T13:06:23.295594", + "extraction_method": "Azure Document Intelligence + GPT-3.5", + "pages": 2, + "layer_count": 3, + "extraction_time_seconds": 10.99 + }, + "layers": [ + { + "page": 1, + "depth_start": 0.0, + "depth_end": 0.2, + "description": "Terre vĂ©gĂ©tale" + }, + { + "page": 1, + "depth_start": 0.2, + "depth_end": 1.4, + "description": "Limon marron ocre orange" + }, + { + "page": 1, + "depth_start": 1.4, + "depth_end": 3.0, + "description": "Argile marron avec passage grisĂątre" + } + ] +} \ No newline at end of file diff --git a/output_man/4_layers.json b/output_man/4_layers.json new file mode 100644 index 0000000..0bed45e --- /dev/null +++ b/output_man/4_layers.json @@ -0,0 +1,72 @@ +{ + "metadata": { + "source_file": "/Users/tom/Desktop/project/vinci/geotech/geotech-azure/examples/sample_pdfs/2.pdf", + "extraction_date": "2025-07-31T13:03:32.291628", + "extraction_method": "Azure Document Intelligence + GPT-3.5", + "pages": 1, + "layer_count": 3, + "extraction_time_seconds": 8.67 + }, + "layers": [ + { + "page": 1, + "depth_start": 0.0, + "depth_end": 2.8, + "description": "Top soil: Soft, beige to grey, silty CLAY to clayey SILT. Containing root of plants and fragments of shells." + }, + { + "page": 1, + "depth_start": 2.8, + "depth_end": 10.0, + "description": "Loose to medium dense, green grey, fine SAND. Containing gravel, fragments of shells a little organic matter." + }, + { + "page": 2, + "depth_start": 10.0, + "depth_end": 20.0, + "description": "Soft, green grey, clayey SILT to silty CLAY. Containing fragments of shells and organic matter." + }, + { + "page": 3, + "depth_start": 20.0, + "depth_end": 24.0, + "description": "Soft, green grey, silty CLAY. Containing fragments of shells and organic matter." + }, + { + "page": 3, + "depth_start": 20.0, + "depth_end": 28.6, + "description": "Soft to firm, green grey, silty CLAY. Containing fragments of shells and organic matter." + }, + { + "page": 3, + "depth_start": 28.6, + "depth_end": 34.0, + "description": "Loose to medium dense, green grey, fine SAND. Containing fragments of shells and a little organic matter." + }, + { + "page": 4, + "depth_start": 34.0, + "depth_end": 47.3, + "description": "Soft to firm, green grey, silty CLAY. Containg bed of silty sand, fragments of shells and a little organic matter." + }, + { + "page": 5, + "depth_start": 47.3, + "depth_end": 51.5, + "description": "Frim to stiff, green grey, silty CLAY. Containing gravel and fragments of shells." + }, + { + "page": 6, + "depth_start": 51.5, + "depth_end": 57.5, + "description": "Medium dense to dense, green grey, fine SAND. Containing fragments of shells and a little organic matter." + }, + { + "page": 6, + "depth_start": 57.5, + "depth_end": 70.0, + "description": "Firm, green grey, silty CLAY. Containing beds of silty sand, fragments of shells and a little organic matter." 
+ } + ] +} \ No newline at end of file diff --git a/output_man/5_layers.json b/output_man/5_layers.json new file mode 100644 index 0000000..26c3347 --- /dev/null +++ b/output_man/5_layers.json @@ -0,0 +1,36 @@ +{ + "metadata": { + "source_file": "/Users/tom/Desktop/project/vinci/geotech/geotech-azure/examples/sample_pdfs/5.pdf", + "extraction_date": "2025-07-30T18:48:56.422438", + "extraction_method": "Azure Document Intelligence + GPT-3.5", + "pages": 2, + "layer_count": 9, + "extraction_time_seconds": 16.26 + }, + "layers": [ + { + "page": 1, + "depth_start": 0.0, + "depth_end": 0.1, + "description": "Terre vĂ©gĂ©tale" + }, + { + "page": 1, + "depth_start": 0.1, + "depth_end": 0.6, + "description": "Limon + argileux brun avec quelques silex et + de matiĂšres organiques." + }, + { + "page": 1, + "depth_start": 0.6, + "depth_end": 2.0, + "description": "Argile Ă  silex TrĂšs charpentĂ©e, marron. Proportion de silex : ~ 80%." + }, + { + "page": 1, + "depth_start": 2.0, + "depth_end": 3.0, + "description": "Craie + argileuse TrĂšs altĂ©rĂ©e, blanchĂątre avec silex. Proportion de silex : > 50%." + } + ] +} \ No newline at end of file diff --git a/output_man/6_layers.json b/output_man/6_layers.json new file mode 100644 index 0000000..738ddee --- /dev/null +++ b/output_man/6_layers.json @@ -0,0 +1,132 @@ +{ + "metadata": { + "source_file": "/Users/tom/Desktop/project/vinci/geotech/geotech-azure/examples/sample_pdfs/5.pdf", + "extraction_date": "2025-07-30T18:48:56.422438", + "extraction_method": "Azure Document Intelligence + GPT-3.5", + "pages": 2, + "layer_count": 9, + "extraction_time_seconds": 16.26 + }, + "layers": [ + { + "page": 1, + "depth_start": 0.0, + "depth_end": 0.3, + "description": "Silty CLAY with some rootlets; brown. Firm, moist, high plasticity (TOPSOIL)." + }, + { + "page": 1, + "depth_start": 0.3, + "depth_end": 1.5, + "description": "Silty CLAY; light brown mottled orange. Firm, moist, high plasticity." + }, + { + "page": 1, + "depth_start": 1.5, + "depth_end": 2.9, + "description": "Sandy SILT with some carbonaceous intrusions; light grey streaked orange. Stiff to very stiff, moist, low plasticity; sand, fine to coarse (Completely degraded NORTHLAND ALLOCHTHON GRADE 5)." + }, + { + "page": 1, + "depth_start": 2.9, + "depth_end": 3.0, + "description": "Highly degraded, light grey streaked orange, massive LIMESTONE; extremely weak, recovered as Sandy SILT; Very stiff, moist, low plasticity (NORTHLAND ALLOCHTHON GRADE 4)." + }, + { + "page": 1, + "depth_start": 3.0, + "depth_end": 5.5, + "description": "Moderately degraded, light bluish grey, massive LIMESTONE; extremely weak to very weak. Highly micro-fractured, sheared and slickensided. (NORTHLAND ALLOCHTHON GRADE 3)." + }, + { + "page": 1, + "depth_start": 5.5, + "depth_end": 5.8, + "description": "Highly degraded, light bluish grey, massive LIMESTONE; extremely weak, recovered as Sandy SILT; Very stiff, moist, low plasticity (NORTHLAND ALLOCHTHON GRADE 4)." + }, + { + "page": 1, + "depth_start": 5.8, + "depth_end": 6.39, + "description": "Moderately degraded, light bluish grey, massive LIMESTONE; extremely weak to very weak. Highly micro-fractured, sheared and slickensided. (NORTHLAND ALLOCHTHON GRADE 3)." + }, + { + "page": 1, + "depth_start": 6.39, + "depth_end": 7.05, + "description": "Highly degraded, light bluish grey, massive LIMESTONE; extremely weak, recovered as Silty fine to coarse SAND with some minor fine gravel; Medium Dense to Dense, wet, poorly graded (NORTHLAND ALLOCHTHON GRADE 4)." 
+ }, + { + "page": 1, + "depth_start": 7.05, + "depth_end": 7.8, + "description": "Highly degraded, light grey, massive LIMESTONE; extremely weak, recovered as Sandy SILT; Very stiff, moist, low plasticity (NORTHLAND ALLOCHTHON GRADE 4)." + }, + { + "page": 1, + "depth_start": 7.8, + "depth_end": 9.7, + "description": "Moderately degraded, grey, thinly bedded MUDSTONE; extremely weak to very weak; bedding is moderately inclined. Highly micro-fractured, sheared and slickensided. (NORTHLAND ALLOCHTHON GRADE 3)." + }, + { + "page": 2, + "depth_start": 9.7, + "depth_end": 11.7, + "description": "Slightly degraded, bluish grey, massive LIMESTONE; extremely weak to very weak. Highly micro-fractured, sheared and slickensided. (NORTHLAND ALLOCHTHON GRADE 2)." + }, + { + "page": 2, + "depth_start": 11.7, + "depth_end": 15.8, + "description": "Slightly weathered, grey, moderately thickly bedded, fine SANDSTONE; weak with thinly interbedded dark grey SILTSTONE; very weak. Bedding is steeply inclined." + }, + { + "page": 2, + "depth_start": 15.8, + "depth_end": 17.4, + "description": "Slightly degraded, bluish grey, massive LIMESTONE; extremely weak. Highly" + }, + { + "page": 3, + "depth_start": 17.4, + "depth_end": 18.0, + "description": "Slightly degraded, dark grey, massive MUDSTONE; weak with some fine to coarse gravel sized, subrounded limestone clasts. Highly microfractured and slickensided (NORTHLAND ALLOCHTHON GRADE 2)." + }, + { + "page": 3, + "depth_start": 18.0, + "depth_end": 21.3, + "description": "Slightly degraded, interbedded, dark grey MUDSTONE and bluish grey LIMESTONE; extremely weak. Highly microfractured and slickensided (NORTHLAND ALLOCHTHON GRADE 2)." + }, + { + "page": 3, + "depth_start": 21.3, + "depth_end": 22.75, + "description": "Slighty degraded, light grey MUDSTONE; extremely weak to very weak. Highly micro-fractured, sheared and slickensided (NORTHLAND ALLOCHTHON GRADE 2)." + }, + { + "page": 3, + "depth_start": 22.75, + "depth_end": 23.4, + "description": "Slightly degraded, bluish grey, massive LIMESTONE; extremely weak. Highly microfractured and slickensided (NORTHLAND ALLOCHTHON GRADE 2)." + }, + { + "page": 3, + "depth_start": 23.4, + "depth_end": 23.7, + "description": "Slighty degraded, light grey MUDSTONE; extremely weak to very weak. Highly micro-fractured, sheared and slickensided (NORTHLAND ALLOCHTHON GRADE 2)." + }, + { + "page": 3, + "depth_start": 23.7, + "depth_end": 24.45, + "description": "Slightly degraded, bluish grey, massive LIMESTONE; extremely weak. Highly microfractured and slickensided (NORTHLAND ALLOCHTHON GRADE 2)." + }, + { + "page": 4, + "depth_start": 24.45, + "depth_end": 30.0, + "description": "Slighty degraded, light grey MUDSTONE; extremely weak to very weak. Highly micro-fractured, sheared and slickensided (NORTHLAND ALLOCHTHON GRADE 2)." 
+    }
+  ]
+}
\ No newline at end of file
diff --git a/prompt_optimizer.py b/prompt_optimizer.py
new file mode 100644
index 0000000..5a1e0e1
--- /dev/null
+++ b/prompt_optimizer.py
@@ -0,0 +1,475 @@
+#!/usr/bin/env python3
+"""
+Simple Prompt Tester - Test current prompt without optimization
+"""
+import click
+import json
+import logging
+import PyPDF2
+import os
+
+from dotenv import load_dotenv
+from pathlib import Path
+from typing import List, Dict, Tuple
+from datetime import datetime
+from difflib import SequenceMatcher
+from azure.ai.formrecognizer import DocumentAnalysisClient
+from azure.core.credentials import AzureKeyCredential
+from openai import AzureOpenAI
+
+load_dotenv()
+
+# Azure Document Intelligence configuration
+AZURE_DOC_ENDPOINT = os.getenv("AZURE_DOC_ENDPOINT")
+AZURE_DOC_KEY = os.getenv("AZURE_DOC_KEY")
+
+# Azure OpenAI configuration
+AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
+AZURE_OPENAI_KEY = os.getenv("AZURE_OPENAI_KEY")
+AZURE_DEPLOYMENT = os.getenv("AZURE_DEPLOYMENT", "gpt-4.1")
+API_VERSION = os.getenv("API_VERSION", "2024-12-01-preview")
+
+if not all([AZURE_DOC_ENDPOINT, AZURE_DOC_KEY, AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_KEY]):
+    raise ValueError(
+        "❌ Missing Azure credentials! Please check your .env file.\n"
+        "Required variables: AZURE_DOC_ENDPOINT, AZURE_DOC_KEY, AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_KEY"
+    )
+
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+logger.info("✅ Environment variables loaded successfully")
+logger.info(f"📍 Document Intelligence endpoint: {AZURE_DOC_ENDPOINT}")
+logger.info(f"đŸ€– OpenAI deployment: {AZURE_DEPLOYMENT}")
+
+class PromptTester:
+    """Test a single prompt on PDFs"""
+    def __init__(self):
+        self.doc_client = DocumentAnalysisClient(
+            endpoint=AZURE_DOC_ENDPOINT,
+            credential=AzureKeyCredential(AZURE_DOC_KEY)
+        )
+        self.openai_client = AzureOpenAI(
+            azure_endpoint=AZURE_OPENAI_ENDPOINT,
+            api_key=AZURE_OPENAI_KEY,
+            api_version=API_VERSION
+        )
+        self.ocr_cache = {}
+
+    def get_current_prompt(self) -> Tuple[str, str]:
+        """Get the current working prompt"""
+        system_prompt = """Tu es un expert gĂ©otechnicien spĂ©cialisĂ© dans l'analyse de logs de forage.
+Ta tĂąche est d'extraire UNIQUEMENT les couches gĂ©ologiques avec leurs profondeurs et descriptions COMPLÈTES.
+
+RÈGLES CRITIQUES:
+1. Chaque couche DOIT avoir une profondeur de dĂ©but et de fin en mĂštres
+2. La description doit contenir TOUT le texte associĂ© Ă  la couche (mĂȘme sur plusieurs lignes)
+3. Notation dĂ©cimale: 0,3 ou 0.3 = 0.3m (PAS 3m!)
+4. Profondeurs typiques: 0-0.3m, 0.3-1m, 1-4m, 4-10m, etc.
+5. NE PAS sĂ©parer ou classifier le matĂ©riau - tout dans la description
+6. IGNORER les mots isolĂ©s qui ne sont pas dans la colonne description
+
+RÈGLE DE CONTINUITÉ DES COUCHES:
+⚠ Une couche gĂ©ologique est CONTINUE entre deux profondeurs
+⚠ Si tu vois une profondeur isolĂ©e suivie de description, cherche OBLIGATOIREMENT la profondeur de fin
+⚠ Les profondeurs de fin peuvent ĂȘtre:
+  - Sur la mĂȘme ligne aprĂšs la description
+  - Sur une ligne suivante
+  - Au dĂ©but de la couche suivante (la fin d'une couche = dĂ©but de la suivante)
+⚠ NE JAMAIS fragmenter une description continue en plusieurs couches
+
+GESTION DES PAGES:
+⚠ Les couches peuvent continuer d'une page Ă  l'autre
+⚠ Sur une nouvelle page, si la premiĂšre profondeur n'est pas 0.0m, c'est la continuation de la page prĂ©cĂ©dente
+⚠ Ne pas dupliquer les couches identiques sur plusieurs pages
+
+SORTIE: JSON valide uniquement"""
+
+        user_prompt_template = """Extrait toutes les couches gĂ©ologiques de cette page de log de forage.
+
+CE QUI EST UNE COUCHE GÉOLOGIQUE:
+✓ Descriptions de matĂ©riaux: sol, roche, sable, argile, limon, remblai, terre vĂ©gĂ©tale, etc.
+✓ A une plage de profondeur ET des propriĂ©tĂ©s de matĂ©riau
+✓ DĂ©crit les caractĂ©ristiques physiques: couleur, consistance, humiditĂ©, altĂ©ration, granulomĂ©trie
+
+CE QUI N'EST PAS UNE COUCHE (IGNORER):
+✗ Descriptions d'images ou lĂ©gendes
+✗ MĂ©tadonnĂ©es du projet (coordonnĂ©es, dates, noms d'entreprise)
+✗ Descriptions d'Ă©quipement ou de mĂ©thodologie
+✗ DonnĂ©es administratives ou commentaires sur la procĂ©dure
+✗ Mots isolĂ©s dans d'autres colonnes (classifications, formations gĂ©ologiques, etc.)
+
+EXCLUSIONS ABSOLUES - NE JAMAIS extraire:
+✗ Annotations techniques isolĂ©es: CZ, JT, JTx5, etc.
+✗ Mesures techniques sans description de matĂ©riau
+✗ Échantillons ("Undisturbed sample", "Disturbed sample", "sample", "Ă©chantillon")
+✗ Lignes contenant uniquement des angles, pressions ou mesures
+✗ RĂ©fĂ©rences Ă  des essais ou tests sans description de sol
+
+ALGORITHME DE DÉTECTION DES PROFONDEURS:
+1. Pour chaque nombre qui ressemble Ă  une profondeur (X.X m ou X,X m):
+  - Si suivi de "-" ou "Ă " ou "bis" → profondeur de dĂ©but, chercher la fin
+  - Si prĂ©cĂ©dĂ© de "-" ou "Ă " ou "bis" → profondeur de fin
+  - Si isolĂ© → vĂ©rifier le contexte:
+    * DĂ©but de description → profondeur de dĂ©but
+    * Fin de description → profondeur de fin
+    * Nouvelle ligne → dĂ©but de nouvelle couche
+2. TOUJOURS vĂ©rifier la cohĂ©rence: fin couche N = dĂ©but couche N+1
+3. Si une couche n'a pas de profondeur de fin explicite, elle va jusqu'au dĂ©but de la couche suivante
+
+MOTIFS À RECHERCHER:
+- "0.00-0.30m: Terre vĂ©gĂ©tale, brun foncĂ©..."
+- "0,3 Ă  1,0m Sable fin beige..."
+- "de 1m Ă  4m: Argile plastique grise..."
+- "1.00 - 4.00 ARGILE limoneuse..."
+- "0.3 m Description... 4.0 m" (profondeurs sĂ©parĂ©es)
+- Tableaux avec colonnes profondeur/description
+
+IDENTIFICATION DES COLONNES DE TABLEAU:
+⚠ Dans un tableau, identifier VISUELLEMENT les colonnes avant d'extraire
+⚠ Colonnes Ă  IGNORER complĂštement:
+  - Formation gĂ©ologique (Quaternaire, Tertiaire, etc.)
+  - Stratigraphie
+  - Classification (Colluvions, Alluvions, etc.)
+  - Symboles ou codes isolĂ©s
+  - Âge gĂ©ologique
+⚠ Colonnes Ă  UTILISER:
+  - Description du matĂ©riau
+  - PropriĂ©tĂ©s physiques (couleur, texture, consistance)
+⚠ Si doute sur une colonne, vĂ©rifier si elle contient des propriĂ©tĂ©s physiques dĂ©taillĂ©es
+
+RÈGLE CRITIQUE DE SÉPARATION:
+⚠ Les profondeurs (0-0.3m, 0.30m, etc.)
ne font JAMAIS partie de la description
+⚠ Si une ligne contient "0,30 m - Mutterboden, braun", alors:
+  - depth_start: 0.0, depth_end: 0.3
+  - description: "Mutterboden, braun" (SANS les profondeurs)
+⚠ Retirer TOUS les prĂ©fixes de profondeur du champ description
+⚠ NE PAS inclure les profondeurs rĂ©pĂ©tĂ©es Ă  la fin de la description
+
+IMPORTANT:
+- Capture TOUT le texte de description, mĂȘme s'il est long ou sur plusieurs lignes
+- Ne pas rĂ©sumer ou reformuler - garder le texte original
+- Inclure TOUTES les propriĂ©tĂ©s mentionnĂ©es (couleur, texture, humiditĂ©, inclusions, etc.)
+- EXCLURE ABSOLUMENT les profondeurs du champ description
+- Si le texte commence par "0,30 m - Mutterboden", la description est SEULEMENT "Mutterboden"
+- SĂ©parer clairement: profondeurs dans depth_start/depth_end, matĂ©riau et propriĂ©tĂ©s dans description
+- NE PAS mĂ©langer les colonnes d'un tableau - prendre uniquement la colonne description principale
+
+VALIDATION OBLIGATOIRE avant de retourner le JSON:
+□ Toutes les couches ont une profondeur de dĂ©but ET de fin
+□ Les profondeurs sont continues (pas de trous entre les couches)
+□ Les profondeurs sont cohĂ©rentes (dĂ©but < fin)
+□ Les descriptions ne contiennent PAS les profondeurs
+□ Les descriptions ne contiennent PAS de noms de colonnes de classification
+□ Aucune annotation technique isolĂ©e n'est prĂ©sente
+□ Pas de duplication de couches identiques
+
+Retourne ce JSON:
+{{
+  "layers": [
+    {{
+      "depth_start": 0.0,
+      "depth_end": 0.3,
+      "description": "Texte de description SANS les profondeurs - uniquement matĂ©riau et propriĂ©tĂ©s"
+    }}
+  ]
+}}
+
+EXEMPLE CONCRET:
+Texte original: "0,00 - 0,30 m Mutterboden, dunkelbraun, humos"
+RĂ©sultat attendu:
+{{
+  "depth_start": 0.0,
+  "depth_end": 0.3,
+  "description": "Mutterboden, dunkelbraun, humos"
+}}
+PAS: "description": "0,00 - 0,30 m Mutterboden, dunkelbraun, humos" ❌
+
+TEXTE DE LA PAGE {page_num}:
+{text}"""
+
+        return system_prompt, user_prompt_template
+
+    def extract_text_from_pdf(self, pdf_path: Path) -> List[str]:
+        """Extract text from PDF with chunking"""
+        logger.info(f"🔍 Extracting text from {pdf_path.name}")
+
+        total_pages = self.get_page_count(pdf_path)
+        logger.info(f"📄 PDF has {total_pages} pages")
+
+        if total_pages == 0:
+            logger.error("Could not determine page count")
+            return []
+
+        # Extract in chunks of 2 pages
+        pages_text = []
+        chunk_size = 2
+        for start_page in range(1, total_pages + 1, chunk_size):
+            end_page = min(start_page + chunk_size - 1, total_pages)
+            with open(pdf_path, "rb") as f:
+                poller = self.doc_client.begin_analyze_document(
+                    model_id="prebuilt-document",
+                    document=f,
+                    pages=f"{start_page}-{end_page}"
+                )
+                result = poller.result()
+                for page in result.pages:
+                    page_lines = [line.content for line in (page.lines or [])]
+                    pages_text.append('\n'.join(page_lines))
+
+        return pages_text
+
+    def get_page_count(self, pdf_path: Path) -> int:
+        """Get the number of pages in a PDF"""
+        try:
+            with open(pdf_path, "rb") as f:
+                pdf_reader = PyPDF2.PdfReader(f)
+                return len(pdf_reader.pages)
+        except Exception:
+            pass
+        return 0
+
+    def _extract_table_text(self, table) -> str:
+        """Extract text from table"""
+        if not hasattr(table, 'cells') or not table.cells:
+            return ""
+
+        max_row = max(cell.row_index for cell in table.cells)
+        max_col = max(cell.column_index for cell in table.cells)
+        table_array = [["" for _ in range(max_col + 1)] for _ in range(max_row + 1)]
+
+        for cell in table.cells:
+            if hasattr(cell, 'content'):
+                table_array[cell.row_index][cell.column_index] = cell.content
+
+        lines = []
+        for row in table_array:
+            row_text = " | ".join(str(cell).strip() for cell in row)
+            if row_text.strip():
+                lines.append(row_text)
+        return "\n".join(lines)
+
+    def test_prompt_on_pdf(self, system_prompt: str, user_prompt_template: str, pdf_path: Path) -> List[Dict]:
+        """Test prompt on a single PDF"""
+        pages_text = self.extract_text_from_pdf(pdf_path)
+        all_layers = []
+
+        for page_idx, page_text in enumerate(pages_text):
+            if not page_text.strip():
+                continue
+
+            user_prompt = user_prompt_template.format(
+                page_num=page_idx + 1,
+                text=page_text[:4000]
+            )
+
+            try:
+                response = self.openai_client.chat.completions.create(
+                    model=AZURE_DEPLOYMENT,
+                    messages=[
+                        {"role": "system", "content": system_prompt},
+                        {"role": "user", "content": user_prompt}
+                    ],
+                    temperature=0.1,
+                    max_tokens=3000,
+                    response_format={"type": "json_object"}
+                )
+
+                result = json.loads(response.choices[0].message.content)
+
+                for layer in result.get("layers", []):
+                    try:
+                        depth_start = float(str(layer.get("depth_start", 0)).replace(",", "."))
+                        depth_end = float(str(layer.get("depth_end", 0)).replace(",", "."))
+                        description = layer.get("description", "").strip()
+
+                        if (depth_start < depth_end and depth_end <= 100 and
+                            depth_start >= 0 and description):
+                            all_layers.append({
+                                "page": page_idx + 1,
+                                "depth_start": depth_start,
+                                "depth_end": depth_end,
+                                "description": description
+                            })
+                    except Exception:
+                        continue
+
+            except Exception as e:
+                logger.error(f"GPT extraction failed for page {page_idx + 1}: {e}")
+
+        return sorted(all_layers, key=lambda x: (x['page'], x['depth_start']))
+
+    def compare_results(self, extracted: List[Dict], expected: List[Dict]) -> Dict:
+        """Compare extracted layers with expected results"""
+        exact_matches = 0
+        depth_matches = 0
+
+        for exp in expected:
+            for ext in extracted:
+                if (ext['page'] == exp['page'] and
+                    abs(ext['depth_start'] - exp['depth_start']) < 0.01 and
+                    abs(ext['depth_end'] - exp['depth_end']) < 0.01):
+                    depth_matches += 1
+
+                    desc_sim = SequenceMatcher(None,
+                        ext['description'].lower(),
+                        exp['description'].lower()
+                    ).ratio()
+
+                    if desc_sim == 1:
+                        exact_matches += 1
+                        break
+
+        if len(expected) > 0:
+            similarity = len([e for e in extracted if any(
+                abs(e['depth_start'] - x['depth_start']) < 0.5 and
+                abs(e['depth_end'] - x['depth_end']) < 0.5 and
+                e['page'] == x['page'] for x in expected
+            )]) / len(expected)
+        else:
+            similarity = 1.0 if len(extracted) == 0 else 0.0
+
+        return {
+            'similarity': similarity,
+            'exact_matches': exact_matches,
+            'depth_matches': depth_matches,
+            'extracted_count': len(extracted),
+            'expected_count': len(expected)
+        }
+
+    def test_all_pdfs(self, examples_dir: Path, expected_dir: Path):
+        """Test prompt on all PDFs"""
+        test_pairs = []
+        for json_file in expected_dir.glob("*_layers.json"):
+            pdf_name = json_file.stem.replace("_layers", "") + ".pdf"
+            pdf_path = examples_dir / pdf_name
+            if pdf_path.exists():
+                test_pairs.append((pdf_path, json_file))
+
+        if not test_pairs:
+            click.echo("❌ No valid PDF/JSON pairs found!")
+            return
+
+        click.echo(f"\n📚 Found {len(test_pairs)} test PDFs")
+
+        system_prompt, user_prompt_template = self.get_current_prompt()
+
+        results = {}
+        total_similarity = 0
+
+        with click.progressbar(test_pairs, label='Testing PDFs') as pairs:
+            for pdf_path, json_path in pairs:
+                try:
+                    with open(json_path, 'r', encoding='utf-8') as f:
+                        data = json.load(f)
+
+                    if isinstance(data, dict) and 'layers' in data:
+                        expected_layers =
data['layers']
+                    elif isinstance(data, list):
+                        expected_layers = data
+                    else:
+                        logger.warning(f"Unknown format in {json_path}")
+                        continue
+
+                    extracted_layers = self.test_prompt_on_pdf(
+                        system_prompt, user_prompt_template, pdf_path
+                    )
+
+                    comparison = self.compare_results(extracted_layers, expected_layers)
+                    results[pdf_path.name] = comparison
+                    total_similarity += comparison['similarity']
+
+                    single_result = {
+                        "metadata": {
+                            "source_file": str(pdf_path.resolve()),
+                            "expected_file": str(json_path.resolve()),
+                            "extraction_time": datetime.now().isoformat(),
+                            "extracted_count": comparison['extracted_count'],
+                            "expected_count": comparison['expected_count']
+                        },
+                        "layers": extracted_layers
+                    }
+
+                    output_dir = Path("output")
+                    output_dir.mkdir(parents=True, exist_ok=True)
+                    output_file_path = output_dir / f"{pdf_path.stem}_test_result.json"
+
+                    with open(output_file_path, 'w', encoding='utf-8') as out_f:
+                        json.dump(single_result, out_f, indent=2, ensure_ascii=False)
+
+                    click.echo(f"đŸ’Ÿ Result saved: {output_file_path}")
+
+                except Exception as e:
+                    logger.error(f"Error with {pdf_path.name}: {e}")
+                    results[pdf_path.name] = {
+                        'similarity': 0,
+                        'error': str(e)
+                    }
+
+        click.echo("\n" + "="*60)
+        click.echo("📊 TEST RESULTS")
+        click.echo("="*60)
+
+        for pdf_name, result in results.items():
+            if 'error' in result:
+                click.echo(f"\n❌ {pdf_name}: ERROR - {result['error']}")
+            else:
+                click.echo(f"\n📄 {pdf_name}:")
+                click.echo(f" Similarity: {result['similarity']:.1%}")
+                click.echo(f" Extracted: {result['extracted_count']} layers")
+                click.echo(f" Expected: {result['expected_count']} layers")
+                click.echo(f" Exact matches: {result['exact_matches']}")
+                click.echo(f" Depth matches: {result['depth_matches']}")
+
+        avg_similarity = total_similarity / len(test_pairs) if test_pairs else 0
+        click.echo("\n" + "="*60)
+        click.echo(f"📈 OVERALL: {avg_similarity:.1%} average similarity")
+        click.echo("="*60)
+
+        output_file = Path("output/test_results.json")
+        with open(output_file, 'w', encoding='utf-8') as f:
+            json.dump({
+                'timestamp': datetime.now().isoformat(),
+                'average_similarity': avg_similarity,
+                'results': results
+            }, f, indent=2)
+
+        click.echo(f"\nđŸ’Ÿ Detailed results saved to: {output_file}")
+
+@click.command()
+@click.option('--examples-dir', '-e', default='examples', help='Directory containing test PDFs')
+@click.option('--expected-dir', '-x', default='output_man', help='Directory containing expected results')
+@click.option('--pdf', '-p', help='Test a single PDF only')
+def main(examples_dir, expected_dir, pdf):
+    """Test current prompt on PDFs without optimization"""
+
+    click.echo("đŸ§Ș Simple Prompt Tester")
+    click.echo("=" * 30)
+
+    tester = PromptTester()
+
+    if pdf:
+        pdf_path = Path(pdf)
+        if not pdf_path.exists():
+            raise click.ClickException(f"PDF not found: {pdf}")
+
+        click.echo(f"\n📄 Testing single PDF: {pdf_path.name}")
+
+        system_prompt, user_prompt_template = tester.get_current_prompt()
+        layers = tester.test_prompt_on_pdf(system_prompt, user_prompt_template, pdf_path)
+
+        click.echo(f"\n✅ Extracted {len(layers)} layers:")
+        for i, layer in enumerate(layers, 1):
+            click.echo(f"\n{i}. Depth: {layer['depth_start']:.2f} - {layer['depth_end']:.2f}m")
+            click.echo(f" Page: {layer['page']}")
+            click.echo(f" Description: {layer['description']}")
+    else:
+        tester.test_all_pdfs(Path(examples_dir), Path(expected_dir))
+
+if __name__ == '__main__':
+    main()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..16e98a7
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,36 @@
+# Azure services
+azure-ai-formrecognizer==3.3.0
+azure-core>=1.29.0
+openai>=1.0.0
+
+# PDF processing
+pdf2image==1.17.0
+Pillow>=10.0.0
+pytesseract==0.3.13  # Fallback OCR
+pdfplumber==0.10.3  # Enhanced table extraction
+PyPDF2>=3.0.1
+
+# CLI and utilities
+click==8.1.7
+
+# Data handling
+numpy>=1.24.0
+python-dateutil>=2.8.2
+
+# For better logging
+colorlog>=6.7.0
+
+# Development tools (optional)
+pytest>=7.4.0
+black>=23.0.0
+flake8>=6.0.0
+
+# Type hints
+typing-extensions>=4.0.0
+
+# For text processing
+nltk>=3.8.1
+textstat>=0.7.3
+python-dotenv==1.0.0
\ No newline at end of file