chore: initialize project structure and configuration files
This commit is contained in:
9
.env-exemple
Normal file
9
.env-exemple
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
# Azure Document Intelligence
|
||||||
|
AZURE_DOC_ENDPOINT=
|
||||||
|
AZURE_DOC_KEY=
|
||||||
|
|
||||||
|
# Azure OpenAI
|
||||||
|
AZURE_OPENAI_ENDPOINT=
|
||||||
|
AZURE_OPENAI_KEY=
|
||||||
|
AZURE_DEPLOYMENT=
|
||||||
|
API_VERSION=
|
||||||
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
|||||||
|
.env
|
||||||
164
Makefile
Normal file
164
Makefile
Normal file
@@ -0,0 +1,164 @@
|
|||||||
|
# Makefile for Geotechnical Layer Extractor
|
||||||
|
.PHONY: help setup single test-optimizer clean test info
|
||||||
|
|
||||||
|
.DEFAULT_GOAL := help
|
||||||
|
|
||||||
|
# Variables
|
||||||
|
PYTHON := python3
|
||||||
|
VENV := venv
|
||||||
|
VENV_BIN := $(VENV)/bin
|
||||||
|
PIP := $(VENV_BIN)/pip
|
||||||
|
PYTHON_VENV := $(VENV_BIN)/python
|
||||||
|
|
||||||
|
# Optimizer variables
|
||||||
|
EXAMPLES_DIR ?= examples
|
||||||
|
EXPECTED_DIR ?= output_man
|
||||||
|
|
||||||
|
help:
|
||||||
|
@echo "🪨 Geotechnical Layer Extractor"
|
||||||
|
@echo "================================"
|
||||||
|
@echo ""
|
||||||
|
@echo "📦 Setup:"
|
||||||
|
@echo " make setup - Install all dependencies and create venv"
|
||||||
|
@echo ""
|
||||||
|
@echo "🚀 Running:"
|
||||||
|
@echo " make single PDF= - Extract layers from a single PDF"
|
||||||
|
@echo " make test-optimizer - Run prompt optimization on test PDFs"
|
||||||
|
@echo ""
|
||||||
|
@echo "🧹 Maintenance:"
|
||||||
|
@echo " make clean - Clean all generated files"
|
||||||
|
@echo " make test - Test Azure connections"
|
||||||
|
@echo " make info - Show system information"
|
||||||
|
@echo ""
|
||||||
|
@echo "Examples:"
|
||||||
|
@echo " make single PDF=document.pdf"
|
||||||
|
@echo " make single PDF=examples/sample.pdf"
|
||||||
|
@echo " make test-optimizer"
|
||||||
|
|
||||||
|
# Create virtual environment
|
||||||
|
venv:
|
||||||
|
@echo "🐍 Creating virtual environment..."
|
||||||
|
@$(PYTHON) -m venv $(VENV)
|
||||||
|
@echo "✅ Virtual environment created!"
|
||||||
|
|
||||||
|
# Setup - Install all dependencies
|
||||||
|
setup: venv
|
||||||
|
@echo "📦 Installing dependencies..."
|
||||||
|
@$(PIP) install --upgrade pip
|
||||||
|
@$(PIP) install -r requirements.txt
|
||||||
|
@mkdir -p output
|
||||||
|
@mkdir -p output_man
|
||||||
|
@mkdir -p examples
|
||||||
|
@echo "✅ Setup complete!"
|
||||||
|
@echo ""
|
||||||
|
@echo "⚠️ Don't forget to:"
|
||||||
|
@echo "1. Add your Azure keys in the scripts"
|
||||||
|
@echo "2. Add test PDFs in examples/ folder"
|
||||||
|
@echo "3. Add expected results in output_man/ folder"
|
||||||
|
|
||||||
|
# Extract layers from a single PDF
|
||||||
|
single:
|
||||||
|
@if [ -z "$(PDF)" ]; then \
|
||||||
|
echo "❌ Please provide a PDF file: make single PDF=your_file.pdf"; \
|
||||||
|
exit 1; \
|
||||||
|
fi
|
||||||
|
@if [ ! -f "$(PDF)" ]; then \
|
||||||
|
echo "❌ File not found: $(PDF)"; \
|
||||||
|
exit 1; \
|
||||||
|
fi
|
||||||
|
@if [ ! -f "extract_single.py" ]; then \
|
||||||
|
echo "❌ extract_single.py not found!"; \
|
||||||
|
exit 1; \
|
||||||
|
fi
|
||||||
|
@echo "🚀 Extracting layers from: $(PDF)"
|
||||||
|
@$(PYTHON_VENV) extract_single.py "$(PDF)" --pretty
|
||||||
|
|
||||||
|
# Run prompt optimizer
|
||||||
|
test-optimizer:
|
||||||
|
@echo "🔬 Running Prompt Optimizer"
|
||||||
|
@echo "=============================="
|
||||||
|
@echo "📁 Examples directory: $(EXAMPLES_DIR)"
|
||||||
|
@echo "📁 Expected results: $(EXPECTED_DIR)"
|
||||||
|
@echo ""
|
||||||
|
@if [ ! -f "prompt_optimizer.py" ]; then \
|
||||||
|
echo "❌ prompt_optimizer.py not found!"; \
|
||||||
|
exit 1; \
|
||||||
|
fi
|
||||||
|
@if [ ! -d "$(EXAMPLES_DIR)" ]; then \
|
||||||
|
echo "❌ Examples directory not found: $(EXAMPLES_DIR)"; \
|
||||||
|
echo "💡 Create it with: mkdir -p $(EXAMPLES_DIR)"; \
|
||||||
|
exit 1; \
|
||||||
|
fi
|
||||||
|
@if [ ! -d "$(EXPECTED_DIR)" ]; then \
|
||||||
|
echo "❌ Expected results directory not found: $(EXPECTED_DIR)"; \
|
||||||
|
echo "💡 Create it with: mkdir -p $(EXPECTED_DIR)"; \
|
||||||
|
exit 1; \
|
||||||
|
fi
|
||||||
|
@echo "🚀 Starting optimization..."
|
||||||
|
@$(PYTHON_VENV) prompt_optimizer.py \
|
||||||
|
--examples-dir $(EXAMPLES_DIR) \
|
||||||
|
--expected-dir $(EXPECTED_DIR)
|
||||||
|
|
||||||
|
# Test Azure connections
|
||||||
|
test:
|
||||||
|
@echo "🧪 Testing Azure connections..."
|
||||||
|
@$(PYTHON_VENV) -c "from azure.ai.formrecognizer import DocumentAnalysisClient; \
|
||||||
|
from azure.core.credentials import AzureKeyCredential; \
|
||||||
|
from openai import AzureOpenAI; \
|
||||||
|
print('✅ Azure imports successful')"
|
||||||
|
@echo "✅ All imports working!"
|
||||||
|
|
||||||
|
# Clean all generated files
|
||||||
|
clean:
|
||||||
|
@echo "🧹 Cleaning all files..."
|
||||||
|
@rm -rf $(VENV)
|
||||||
|
@rm -rf __pycache__ */__pycache__
|
||||||
|
@rm -rf .pytest_cache
|
||||||
|
@rm -rf *.pyc */*.pyc
|
||||||
|
@rm -rf output/*.json output/*.txt output/*.html
|
||||||
|
@rm -rf *.log
|
||||||
|
@rm -rf test_results.json
|
||||||
|
@find . -type f -name ".DS_Store" -delete 2>/dev/null || true
|
||||||
|
@echo "✅ Clean complete!"
|
||||||
|
|
||||||
|
# Show system information
|
||||||
|
info:
|
||||||
|
@echo "📊 System Information:"
|
||||||
|
@echo "======================"
|
||||||
|
@echo "OS: $$(uname -s)"
|
||||||
|
@echo "Python: $$($$(which $(PYTHON)) --version)"
|
||||||
|
@if [ -d "$(VENV)" ]; then \
|
||||||
|
echo "Virtual env: ✅ $(VENV) exists"; \
|
||||||
|
echo "Pip version: $$($(PIP) --version 2>/dev/null || echo 'N/A')"; \
|
||||||
|
else \
|
||||||
|
echo "Virtual env: ❌ Not created (run 'make setup')"; \
|
||||||
|
fi
|
||||||
|
@echo ""
|
||||||
|
@echo "📁 Project structure:"
|
||||||
|
@if [ -f "extract_single.py" ]; then \
|
||||||
|
echo "✅ extract_single.py found"; \
|
||||||
|
else \
|
||||||
|
echo "❌ extract_single.py missing"; \
|
||||||
|
fi
|
||||||
|
@if [ -f "prompt_optimizer.py" ]; then \
|
||||||
|
echo "✅ prompt_optimizer.py found"; \
|
||||||
|
else \
|
||||||
|
echo "❌ prompt_optimizer.py missing"; \
|
||||||
|
fi
|
||||||
|
@echo ""
|
||||||
|
@echo "📁 Directories:"
|
||||||
|
@if [ -d "output" ]; then \
|
||||||
|
echo "✅ output/ exists ($$(ls -1 output/ 2>/dev/null | wc -l | tr -d ' ') files)"; \
|
||||||
|
else \
|
||||||
|
echo "❌ output/ not found"; \
|
||||||
|
fi
|
||||||
|
@if [ -d "$(EXAMPLES_DIR)" ]; then \
|
||||||
|
echo "✅ $(EXAMPLES_DIR)/ exists ($$(ls -1 $(EXAMPLES_DIR)/*.pdf 2>/dev/null | wc -l | tr -d ' ') PDFs)"; \
|
||||||
|
else \
|
||||||
|
echo "❌ $(EXAMPLES_DIR)/ not found"; \
|
||||||
|
fi
|
||||||
|
@if [ -d "$(EXPECTED_DIR)" ]; then \
|
||||||
|
echo "✅ $(EXPECTED_DIR)/ exists ($$(ls -1 $(EXPECTED_DIR)/*_layers.json 2>/dev/null | wc -l | tr -d ' ') JSON files)"; \
|
||||||
|
else \
|
||||||
|
echo "❌ $(EXPECTED_DIR)/ not found"; \
|
||||||
|
fi
|
||||||
330
README.md
330
README.md
@@ -0,0 +1,330 @@
|
|||||||
|
# 🪨 Geotechnical Layer Extractor
|
||||||
|
|
||||||
|
Un outil d'extraction automatique des couches géologiques à partir de logs de forage au format PDF, utilisant l'intelligence artificielle pour analyser et structurer les données géotechniques.
|
||||||
|
|
||||||
|
## 🎯 Objectif
|
||||||
|
|
||||||
|
Ce projet permet d'extraire automatiquement les informations de couches géologiques (profondeurs et descriptions) depuis des documents PDF de logs de forage, souvent complexes et mal structurés. Il utilise Azure Document Intelligence pour l'OCR et GPT-4 pour l'analyse sémantique du contenu.
|
||||||
|
|
||||||
|
## ✨ Fonctionnalités
|
||||||
|
|
||||||
|
- **Extraction OCR avancée** : Utilise Azure Document Intelligence pour extraire texte et tableaux
|
||||||
|
- **Analyse par IA** : GPT-4 analyse le contenu pour identifier les couches géologiques
|
||||||
|
- **Gestion multi-pages** : Traite correctement les documents de plusieurs pages
|
||||||
|
- **Détection de tableaux** : Identifie et parse les tableaux complexes
|
||||||
|
- **Déduplication intelligente** : Évite les doublons entre pages
|
||||||
|
- **Optimisation de prompts** : Outil pour améliorer automatiquement les prompts
|
||||||
|
- **Export JSON structuré** : Format de sortie standardisé et exploitable
|
||||||
|
|
||||||
|
## 📋 Prérequis
|
||||||
|
|
||||||
|
- Python 3.8 ou supérieur
|
||||||
|
- Compte Azure avec accès à :
|
||||||
|
- Azure Document Intelligence (Form Recognizer)
|
||||||
|
- Azure OpenAI avec accès GPT-4
|
||||||
|
|
||||||
|
## 🚀 Installation
|
||||||
|
|
||||||
|
### 1. Cloner le repository
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git clone https://git.linaster.online/vincent.morisseau/geotech.git
|
||||||
|
cd geotech
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Installer les dépendances
|
||||||
|
|
||||||
|
```bash
|
||||||
|
make setup
|
||||||
|
```
|
||||||
|
|
||||||
|
Cela va :
|
||||||
|
- Créer un environnement virtuel Python
|
||||||
|
- Installer tous les packages nécessaires
|
||||||
|
- Créer les dossiers requis
|
||||||
|
|
||||||
|
### 3. Configuration des variables d'environnement
|
||||||
|
|
||||||
|
⚠️ **IMPORTANT** : Les clés API ne doivent jamais être commitées dans Git !
|
||||||
|
|
||||||
|
1. **Copier le fichier d'exemple** :
|
||||||
|
```bash
|
||||||
|
cp .env.example .env
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **Éditer le fichier `.env`** et remplir vos clés API Azure :
|
||||||
|
```bash
|
||||||
|
nano .env # ou utilisez votre éditeur préféré
|
||||||
|
```
|
||||||
|
|
||||||
|
3. **Remplir les variables suivantes** dans le fichier `.env` :
|
||||||
|
```env
|
||||||
|
# Azure Document Intelligence
|
||||||
|
AZURE_DOC_ENDPOINT=https://your-resource.cognitiveservices.azure.com/
|
||||||
|
AZURE_DOC_KEY=your-document-intelligence-key-here
|
||||||
|
|
||||||
|
# Azure OpenAI
|
||||||
|
AZURE_OPENAI_ENDPOINT=https://your-resource.openai.azure.com/
|
||||||
|
AZURE_OPENAI_KEY=your-openai-key-here
|
||||||
|
AZURE_DEPLOYMENT=gpt-4o-mini # ou gpt-4o, gpt-4, etc.
|
||||||
|
API_VERSION=2024-12-01-preview
|
||||||
|
```
|
||||||
|
|
||||||
|
⚠️ **Note de sécurité** : Le fichier `.env` est automatiquement ignoré par Git (via `.gitignore`) pour protéger vos clés API.
|
||||||
|
|
||||||
|
### 4. Obtenir les clés Azure
|
||||||
|
|
||||||
|
#### Azure Document Intelligence
|
||||||
|
|
||||||
|
1. **Créer une ressource dans le [portail Azure](https://portal.azure.com)**
|
||||||
|
- Home → Create a resource
|
||||||
|
- IA + Machine Learning → Document Intelligence (Form Recognizer)
|
||||||
|
- Choisir la région (ex: East US 2)
|
||||||
|
- Pricing tier : S0 (Standard)
|
||||||
|
|
||||||
|
2. **Récupérer les credentials**
|
||||||
|
- Aller dans votre ressource → Keys and Endpoint
|
||||||
|
- Copier `KEY 1` et `Endpoint`
|
||||||
|
- Coller ces valeurs dans votre fichier `.env`
|
||||||
|
|
||||||
|
#### Azure OpenAI
|
||||||
|
|
||||||
|
1. **Demander l'accès à Azure OpenAI** (si nécessaire)
|
||||||
|
- [Formulaire de demande d'accès](https://azure.microsoft.com/en-us/products/ai-services/openai-service)
|
||||||
|
- Délai : 24-48h généralement
|
||||||
|
|
||||||
|
2. **Créer une ressource Azure OpenAI**
|
||||||
|
- Home → Create a resource → Azure OpenAI
|
||||||
|
- Choisir la région (recommandé : Sweden Central ou East US 2)
|
||||||
|
|
||||||
|
3. **Déployer un modèle GPT**
|
||||||
|
- Accéder à Azure OpenAI Studio depuis votre ressource
|
||||||
|
- Deployments → Create new deployment
|
||||||
|
- Choisir votre modèle :
|
||||||
|
- `gpt-4o-mini` : Économique, 0,003$/page
|
||||||
|
- `gpt-4o` : Meilleur rapport qualité/prix, 0,042$/page
|
||||||
|
- `gpt-4` : Plus cher mais plus puissant, 0,21$/page
|
||||||
|
|
||||||
|
4. **Récupérer les credentials**
|
||||||
|
- Dans Azure OpenAI Studio → Deployments → View code
|
||||||
|
- Copier l'endpoint, la clé et le nom du déploiement
|
||||||
|
- Coller ces valeurs dans votre fichier `.env`
|
||||||
|
|
||||||
|
### 5. Vérifier la configuration
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Test de connexion aux services Azure
|
||||||
|
make test
|
||||||
|
|
||||||
|
# Ou vérifier manuellement
|
||||||
|
python -c "from dotenv import load_dotenv; import os; load_dotenv(); print('✅ OK' if os.getenv('AZURE_DOC_KEY') else '❌ .env non configuré')"
|
||||||
|
```
|
||||||
|
|
||||||
|
## 📖 Utilisation
|
||||||
|
|
||||||
|
### Extraction d'un seul PDF
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Extraction simple (sauvegarde dans output/document.json)
|
||||||
|
make single PDF=document.pdf
|
||||||
|
|
||||||
|
# Ou directement avec le script
|
||||||
|
python extract_single.py document.pdf
|
||||||
|
|
||||||
|
# Spécifier un fichier de sortie custom
|
||||||
|
python extract_single.py document.pdf -o results/custom.json
|
||||||
|
|
||||||
|
# Avec formatage JSON lisible
|
||||||
|
python extract_single.py document.pdf --pretty
|
||||||
|
```
|
||||||
|
|
||||||
|
### Optimisation des prompts
|
||||||
|
|
||||||
|
Pour améliorer la qualité d'extraction sur un ensemble de PDFs de test :
|
||||||
|
|
||||||
|
1. Placer les PDFs de test dans `examples/`
|
||||||
|
2. Placer les résultats attendus dans `output_man/` (format : `1_layers.json` pour `1.pdf`)
|
||||||
|
3. Lancer le test :
|
||||||
|
|
||||||
|
```bash
|
||||||
|
make test-optimizer
|
||||||
|
```
|
||||||
|
|
||||||
|
### Test des connexions Azure
|
||||||
|
|
||||||
|
```bash
|
||||||
|
make test
|
||||||
|
```
|
||||||
|
|
||||||
|
## 📁 Structure du projet
|
||||||
|
|
||||||
|
```
|
||||||
|
geotechnical-extractor/
|
||||||
|
├── .env.example # Fichier d'exemple pour les variables d'environnement
|
||||||
|
├── .env # VOS clés API (ignoré par Git)
|
||||||
|
├── .gitignore # Fichiers à ignorer (inclut .env)
|
||||||
|
├── extract_single.py # Script principal d'extraction
|
||||||
|
├── prompt_optimizer.py # Optimiseur de prompts
|
||||||
|
├── requirements.txt # Dépendances Python
|
||||||
|
├── Makefile # Commandes simplifiées
|
||||||
|
├── README.md # Ce fichier
|
||||||
|
├── examples/ # PDFs de test pour l'optimiseur
|
||||||
|
│ ├── 1.pdf
|
||||||
|
│ ├── 2.pdf
|
||||||
|
│ └── ...
|
||||||
|
├── output_man/ # Résultats attendus (vérité terrain)
|
||||||
|
│ ├── 1_layers.json
|
||||||
|
│ ├── 2_layers.json
|
||||||
|
│ └── ...
|
||||||
|
└── output/ # Résultats générés
|
||||||
|
├── document.json
|
||||||
|
└── ...
|
||||||
|
```
|
||||||
|
|
||||||
|
## 📊 Format de sortie
|
||||||
|
|
||||||
|
Les résultats sont sauvegardés en JSON avec la structure suivante :
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"metadata": {
|
||||||
|
"source_file": "/path/to/document.pdf",
|
||||||
|
"extraction_date": "2025-08-06T10:30:45.123456",
|
||||||
|
"extraction_method": "Azure Document Intelligence + GPT-4",
|
||||||
|
"pages": 6,
|
||||||
|
"layer_count": 12,
|
||||||
|
"extraction_time_seconds": 23.5
|
||||||
|
},
|
||||||
|
"layers": [
|
||||||
|
{
|
||||||
|
"page": 1,
|
||||||
|
"depth_start": 0.0,
|
||||||
|
"depth_end": 0.3,
|
||||||
|
"description": "Terre végétale, brun foncé, présence de racines"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"page": 1,
|
||||||
|
"depth_start": 0.3,
|
||||||
|
"depth_end": 2.5,
|
||||||
|
"description": "Argile limoneuse, gris-beige, plastique, humide"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Description des champs
|
||||||
|
|
||||||
|
- **metadata** : Informations sur l'extraction
|
||||||
|
- `source_file` : Chemin absolu du PDF source
|
||||||
|
- `extraction_date` : Date et heure de l'extraction
|
||||||
|
- `pages` : Nombre de pages traitées
|
||||||
|
- `layer_count` : Nombre de couches extraites
|
||||||
|
- `extraction_time_seconds` : Durée de l'extraction
|
||||||
|
|
||||||
|
- **layers** : Liste des couches géologiques
|
||||||
|
- `page` : Numéro de page où apparaît la couche
|
||||||
|
- `depth_start` : Profondeur de début en mètres
|
||||||
|
- `depth_end` : Profondeur de fin en mètres
|
||||||
|
- `description` : Description complète du matériau
|
||||||
|
|
||||||
|
## 💰 Coûts estimés
|
||||||
|
|
||||||
|
Entre ~0,003€ et ~0,21€ par page selon le modèle.
|
||||||
|
|
||||||
|
**Recommandation** : Utilisez GPT-4o pour le meilleur rapport qualité/prix
|
||||||
|
|
||||||
|
## 🚨 Résolution des problèmes
|
||||||
|
|
||||||
|
### "Missing Azure credentials"
|
||||||
|
- Vérifier que le fichier `.env` existe
|
||||||
|
- Vérifier que toutes les variables sont remplies dans `.env`
|
||||||
|
- S'assurer d'avoir copié `.env.example` vers `.env`
|
||||||
|
|
||||||
|
### "Resource not found"
|
||||||
|
- Vérifier que l'endpoint Azure est correct (avec ou sans `/` final)
|
||||||
|
- S'assurer que la ressource est déployée et active
|
||||||
|
|
||||||
|
### "Invalid API key"
|
||||||
|
- Vérifier que vous utilisez la bonne clé
|
||||||
|
- Les clés Document Intelligence et OpenAI sont différentes
|
||||||
|
- Vérifier qu'il n'y a pas d'espaces dans les clés
|
||||||
|
|
||||||
|
### "Model not found"
|
||||||
|
- Vérifier le nom exact du déploiement GPT dans `.env`
|
||||||
|
- S'assurer que le modèle est déployé dans votre région
|
||||||
|
- Le nom est sensible à la casse
|
||||||
|
|
||||||
|
### Extraction incomplète
|
||||||
|
- Augmenter la limite de caractères dans le script
|
||||||
|
- Vérifier la qualité du PDF (scan lisible)
|
||||||
|
- Utiliser GPT-4o au lieu de GPT-4o-mini
|
||||||
|
|
||||||
|
### Pages manquantes
|
||||||
|
- Le script traite maintenant jusqu'à 8000 caractères par page
|
||||||
|
- Les tableaux sont marqués avec `[TABLE DATA]` pour une meilleure détection
|
||||||
|
|
||||||
|
## 🛠️ Personnalisation
|
||||||
|
|
||||||
|
### Modifier les prompts
|
||||||
|
|
||||||
|
Les prompts sont définis dans la méthode `get_prompts()` des scripts. Vous pouvez les adapter selon vos besoins spécifiques.
|
||||||
|
|
||||||
|
### Ajuster les limites
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Augmenter la limite de caractères par page
|
||||||
|
max_chars = 10000 # Au lieu de 8000
|
||||||
|
|
||||||
|
# Augmenter les tokens de réponse
|
||||||
|
max_tokens=6000 # Au lieu de 4000
|
||||||
|
```
|
||||||
|
|
||||||
|
### Variables d'environnement supplémentaires
|
||||||
|
|
||||||
|
Vous pouvez ajouter dans votre `.env` :
|
||||||
|
|
||||||
|
```env
|
||||||
|
# Optionnel : niveau de log
|
||||||
|
LOG_LEVEL=DEBUG # ou INFO, WARNING, ERROR
|
||||||
|
|
||||||
|
# Optionnel : taille des chunks pour les PDFs volumineux
|
||||||
|
CHUNK_SIZE=2 # Nombre de pages par chunk
|
||||||
|
```
|
||||||
|
|
||||||
|
### Langues supportées
|
||||||
|
|
||||||
|
Les prompts actuels sont en français mais peuvent être traduits. Le système supporte tous les documents que Document Intelligence peut traiter.
|
||||||
|
|
||||||
|
## 🧹 Maintenance
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Nettoyer tous les fichiers générés
|
||||||
|
make clean
|
||||||
|
|
||||||
|
# Voir les informations système
|
||||||
|
make info
|
||||||
|
```
|
||||||
|
|
||||||
|
## 📈 Performances
|
||||||
|
|
||||||
|
- **Temps moyen** : 3-5 secondes par page
|
||||||
|
- **Précision** : ~85-95% selon la complexité du document
|
||||||
|
- **Limite** : PDFs jusqu'à 500 pages (limite Azure)
|
||||||
|
|
||||||
|
## 🔒 Sécurité
|
||||||
|
|
||||||
|
- **Ne jamais committer le fichier `.env`** contenant vos clés API
|
||||||
|
- Le fichier `.env` est automatiquement ignoré par Git
|
||||||
|
- Utilisez `.env.example` comme template pour partager la structure
|
||||||
|
- Changez régulièrement vos clés API Azure
|
||||||
|
- Utilisez des variables d'environnement en production
|
||||||
|
|
||||||
|
## 📝 Licence
|
||||||
|
|
||||||
|
Ce projet est sous licence VCGI par Tom BLANCHETON.
|
||||||
|
|
||||||
|
## 🙏 Remerciements
|
||||||
|
|
||||||
|
- Azure Document Intelligence pour l'OCR de qualité
|
||||||
|
- OpenAI pour GPT-4
|
||||||
|
- La communauté géotechnique pour les retours et tests
|
||||||
|
|||||||
BIN
examples/1.pdf
Normal file
BIN
examples/1.pdf
Normal file
Binary file not shown.
BIN
examples/2.pdf
Normal file
BIN
examples/2.pdf
Normal file
Binary file not shown.
BIN
examples/3.pdf
Normal file
BIN
examples/3.pdf
Normal file
Binary file not shown.
BIN
examples/4.pdf
Normal file
BIN
examples/4.pdf
Normal file
Binary file not shown.
BIN
examples/5.pdf
Normal file
BIN
examples/5.pdf
Normal file
Binary file not shown.
BIN
examples/6.pdf
Normal file
BIN
examples/6.pdf
Normal file
Binary file not shown.
BIN
examples/7.pdf
Normal file
BIN
examples/7.pdf
Normal file
Binary file not shown.
BIN
examples/8.pdf
Normal file
BIN
examples/8.pdf
Normal file
Binary file not shown.
429
extract_single.py
Normal file
429
extract_single.py
Normal file
@@ -0,0 +1,429 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Single PDF Layer Extractor - Extract geological layers from a single PDF
|
||||||
|
"""
|
||||||
|
import click
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import PyPDF2
|
||||||
|
import os
|
||||||
|
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import List, Dict, Tuple
|
||||||
|
from datetime import datetime
|
||||||
|
from azure.ai.formrecognizer import DocumentAnalysisClient
|
||||||
|
from azure.core.credentials import AzureKeyCredential
|
||||||
|
from openai import AzureOpenAI
|
||||||
|
|
||||||
|
load_dotenv()
|
||||||
|
|
||||||
|
# Configuration Azure Document Intelligence
|
||||||
|
AZURE_DOC_ENDPOINT = os.getenv("AZURE_DOC_ENDPOINT")
|
||||||
|
AZURE_DOC_KEY = os.getenv("AZURE_DOC_KEY")
|
||||||
|
|
||||||
|
# Configuration Azure OpenAI
|
||||||
|
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
|
||||||
|
AZURE_OPENAI_KEY = os.getenv("AZURE_OPENAI_KEY")
|
||||||
|
AZURE_DEPLOYMENT = os.getenv("AZURE_DEPLOYMENT", "gpt-4.1")
|
||||||
|
API_VERSION = os.getenv("API_VERSION", "2024-12-01-preview")
|
||||||
|
|
||||||
|
if not all([AZURE_DOC_ENDPOINT, AZURE_DOC_KEY, AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_KEY]):
|
||||||
|
raise ValueError(
|
||||||
|
"❌ Missing Azure credentials! Please check your .env file.\n"
|
||||||
|
"Required variables: AZURE_DOC_ENDPOINT, AZURE_DOC_KEY, AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_KEY"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Configuration du logging
|
||||||
|
logging.basicConfig(
|
||||||
|
level=logging.INFO,
|
||||||
|
format='%(asctime)s - %(levelname)s - %(message)s'
|
||||||
|
)
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
logger.info("✅ Environment variables loaded successfully")
|
||||||
|
logger.info(f"📍 Document Intelligence endpoint: {AZURE_DOC_ENDPOINT}")
|
||||||
|
logger.info(f"🤖 OpenAI deployment: {AZURE_DEPLOYMENT}")
|
||||||
|
|
||||||
|
class SinglePDFExtractor:
|
||||||
|
"""Extract geological layers from a single PDF"""
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
"""Initialize Azure clients"""
|
||||||
|
self.doc_client = DocumentAnalysisClient(
|
||||||
|
endpoint=AZURE_DOC_ENDPOINT,
|
||||||
|
credential=AzureKeyCredential(AZURE_DOC_KEY)
|
||||||
|
)
|
||||||
|
self.openai_client = AzureOpenAI(
|
||||||
|
azure_endpoint=AZURE_OPENAI_ENDPOINT,
|
||||||
|
api_key=AZURE_OPENAI_KEY,
|
||||||
|
api_version=API_VERSION
|
||||||
|
)
|
||||||
|
|
||||||
|
def get_prompts(self) -> Tuple[str, str]:
|
||||||
|
"""Get the system and user prompts"""
|
||||||
|
system_prompt = """Tu es un expert géotechnicien spécialisé dans l'analyse de logs de forage.
|
||||||
|
Ta tâche est d'extraire UNIQUEMENT les couches géologiques avec leurs profondeurs et descriptions COMPLÈTES.
|
||||||
|
|
||||||
|
RÈGLES CRITIQUES:
|
||||||
|
1. Chaque couche DOIT avoir une profondeur de début et de fin en mètres
|
||||||
|
2. La description doit contenir TOUT le texte associé à la couche (même sur plusieurs lignes)
|
||||||
|
3. Notation décimale: 0,3 ou 0.3 = 0.3m (PAS 3m!)
|
||||||
|
4. Profondeurs typiques: 0-0.3m, 0.3-1m, 1-4m, 4-10m, etc.
|
||||||
|
5. NE PAS séparer ou classifier le matériau - tout dans la description
|
||||||
|
6. IGNORER les mots isolés qui ne sont pas dans la colonne description
|
||||||
|
|
||||||
|
RÈGLE DE CONTINUITÉ DES COUCHES:
|
||||||
|
⚠️ Une couche géologique est CONTINUE entre deux profondeurs
|
||||||
|
⚠️ Si tu vois une profondeur isolée suivie de description, cherche OBLIGATOIREMENT la profondeur de fin
|
||||||
|
⚠️ Les profondeurs de fin peuvent être:
|
||||||
|
- Sur la même ligne après la description
|
||||||
|
- Sur une ligne suivante
|
||||||
|
- Au début de la couche suivante (la fin d'une couche = début de la suivante)
|
||||||
|
⚠️ NE JAMAIS fragmenter une description continue en plusieurs couches
|
||||||
|
|
||||||
|
GESTION DES PAGES:
|
||||||
|
⚠️ Les couches peuvent continuer d'une page à l'autre
|
||||||
|
⚠️ Sur une nouvelle page, si la première profondeur n'est pas 0.0m, c'est la continuation de la page précédente
|
||||||
|
⚠️ Ne pas dupliquer les couches identiques sur plusieurs pages
|
||||||
|
|
||||||
|
SORTIE: JSON valide uniquement"""
|
||||||
|
|
||||||
|
user_prompt_template = """Extrait toutes les couches géologiques de cette page de log de forage.
|
||||||
|
|
||||||
|
CE QUI EST UNE COUCHE GÉOLOGIQUE:
|
||||||
|
✓ Descriptions de matériaux: sol, roche, sable, argile, limon, remblai, terre végétale, etc.
|
||||||
|
✓ A une plage de profondeur ET des propriétés de matériau
|
||||||
|
✓ Décrit les caractéristiques physiques: couleur, consistance, humidité, altération, granulométrie
|
||||||
|
|
||||||
|
CE QUI N'EST PAS UNE COUCHE (IGNORER):
|
||||||
|
✗ Descriptions d'images ou légendes
|
||||||
|
✗ Métadonnées du projet (coordonnées, dates, noms d'entreprise)
|
||||||
|
✗ Descriptions d'équipement ou de méthodologie
|
||||||
|
✗ Données administratives ou commentaires sur la procédure
|
||||||
|
✗ Mots isolés dans d'autres colonnes (classifications, formations géologiques, etc.)
|
||||||
|
|
||||||
|
EXCLUSIONS ABSOLUES - NE JAMAIS extraire:
|
||||||
|
✗ Annotations techniques isolées: CZ, JT, JTx5, etc.
|
||||||
|
✗ Mesures techniques sans description de matériau
|
||||||
|
✗ Échantillons ("Undisturbed sample", "Disturbed sample", "sample", "échantillon")
|
||||||
|
✗ Lignes contenant uniquement des angles, pressions ou mesures
|
||||||
|
✗ Références à des essais ou tests sans description de sol
|
||||||
|
|
||||||
|
ALGORITHME DE DÉTECTION DES PROFONDEURS:
|
||||||
|
1. Pour chaque nombre qui ressemble à une profondeur (X.X m ou X,X m):
|
||||||
|
- Si suivi de "-" ou "à" ou "bis" → profondeur de début, chercher la fin
|
||||||
|
- Si précédé de "-" ou "à" ou "bis" → profondeur de fin
|
||||||
|
- Si isolé → vérifier le contexte:
|
||||||
|
* Début de description → profondeur de début
|
||||||
|
* Fin de description → profondeur de fin
|
||||||
|
* Nouvelle ligne → début de nouvelle couche
|
||||||
|
2. TOUJOURS vérifier la cohérence: fin couche N = début couche N+1
|
||||||
|
3. Si une couche n'a pas de profondeur de fin explicite, elle va jusqu'au début de la couche suivante
|
||||||
|
|
||||||
|
MOTIFS À RECHERCHER:
|
||||||
|
- "0.00-0.30m: Terre végétale, brun foncé..."
|
||||||
|
- "0,3 à 1,0m Sable fin beige..."
|
||||||
|
- "de 1m à 4m: Argile plastique grise..."
|
||||||
|
- "1.00 - 4.00 ARGILE limoneuse..."
|
||||||
|
- "0.3 m Description... 4.0 m" (profondeurs séparées)
|
||||||
|
- Tableaux avec colonnes profondeur/description
|
||||||
|
|
||||||
|
IDENTIFICATION DES COLONNES DE TABLEAU:
|
||||||
|
⚠️ Dans un tableau, identifier VISUELLEMENT les colonnes avant d'extraire
|
||||||
|
⚠️ Colonnes à IGNORER complètement:
|
||||||
|
- Formation géologique (Quaternaire, Tertiaire, Quaternaire, etc.)
|
||||||
|
- Stratigraphie
|
||||||
|
- Classification (Colluvions, Alluvions, etc.)
|
||||||
|
- Symboles ou codes isolés
|
||||||
|
- Âge géologique
|
||||||
|
⚠️ Colonnes à UTILISER:
|
||||||
|
- Description du matériau
|
||||||
|
- Propriétés physiques (couleur, texture, consistance)
|
||||||
|
⚠️ Si doute sur une colonne, vérifier si elle contient des propriétés physiques détaillées
|
||||||
|
|
||||||
|
RÈGLE CRITIQUE DE SÉPARATION:
|
||||||
|
⚠️ Les profondeurs (0-0.3m, 0.30m, etc.) ne font JAMAIS partie de la description
|
||||||
|
⚠️ Si une ligne contient "0,30 m - Mutterboden, braun", alors:
|
||||||
|
- depth_start: 0.0, depth_end: 0.3
|
||||||
|
- description: "Mutterboden, braun" (SANS les profondeurs)
|
||||||
|
⚠️ Retirer TOUS les préfixes de profondeur du champ description
|
||||||
|
⚠️ NE PAS inclure les profondeurs répétées à la fin de la description
|
||||||
|
|
||||||
|
IMPORTANT:
|
||||||
|
- Capture TOUT le texte de description, même s'il est long ou sur plusieurs lignes
|
||||||
|
- Ne pas résumer ou reformuler - garder le texte original
|
||||||
|
- Inclure TOUTES les propriétés mentionnées (couleur, texture, humidité, inclusions, etc.)
|
||||||
|
- EXCLURE ABSOLUMENT les profondeurs du champ description
|
||||||
|
- Si le texte commence par "0,30 m - Mutterboden", la description est SEULEMENT "Mutterboden"
|
||||||
|
- Séparer clairement: profondeurs dans depth_start/depth_end, matériau et propriétés dans description
|
||||||
|
- NE PAS mélanger les colonnes d'un tableau - prendre uniquement la colonne description principale
|
||||||
|
|
||||||
|
VALIDATION OBLIGATOIRE avant de retourner le JSON:
|
||||||
|
□ Toutes les couches ont une profondeur de début ET de fin
|
||||||
|
□ Les profondeurs sont continues (pas de trous entre les couches)
|
||||||
|
□ Les profondeurs sont cohérentes (début < fin)
|
||||||
|
□ Les descriptions ne contiennent PAS les profondeurs
|
||||||
|
□ Les descriptions ne contiennent PAS de noms de colonnes de classification
|
||||||
|
□ Aucune annotation technique isolée n'est présente
|
||||||
|
□ Pas de duplication de couches identiques
|
||||||
|
|
||||||
|
Retourne ce JSON:
|
||||||
|
{{
|
||||||
|
"layers": [
|
||||||
|
{{
|
||||||
|
"depth_start": 0.0,
|
||||||
|
"depth_end": 0.3,
|
||||||
|
"description": "Texte de description SANS les profondeurs - uniquement matériau et propriétés"
|
||||||
|
}}
|
||||||
|
]
|
||||||
|
}}
|
||||||
|
|
||||||
|
EXEMPLE CONCRET:
|
||||||
|
Texte original: "0,00 - 0,30 m Mutterboden, dunkelbraun, humos"
|
||||||
|
Résultat attendu:
|
||||||
|
{{
|
||||||
|
"depth_start": 0.0,
|
||||||
|
"depth_end": 0.3,
|
||||||
|
"description": "Mutterboden, dunkelbraun, humos"
|
||||||
|
}}
|
||||||
|
PAS: "description": "0,00 - 0,30 m Mutterboden, dunkelbraun, humos" ❌
|
||||||
|
|
||||||
|
TEXTE DE LA PAGE {page_num}:
|
||||||
|
{text}"""
|
||||||
|
|
||||||
|
return system_prompt, user_prompt_template
|
||||||
|
|
||||||
|
def extract_text_from_pdf(self, pdf_path: Path) -> List[str]:
|
||||||
|
"""Extract text from PDF with chunking"""
|
||||||
|
logger.info(f"🔍 Extracting text from {pdf_path.name}")
|
||||||
|
|
||||||
|
total_pages = self.get_page_count(pdf_path)
|
||||||
|
logger.info(f"📄 PDF has {total_pages} pages")
|
||||||
|
|
||||||
|
if total_pages == 0:
|
||||||
|
logger.error("Could not determine page count")
|
||||||
|
return []
|
||||||
|
|
||||||
|
# Extraire par chunks de 2 pages
|
||||||
|
pages_text = []
|
||||||
|
chunk_size = 2
|
||||||
|
for start_page in range(1, total_pages + 1, chunk_size):
|
||||||
|
end_page = min(start_page + chunk_size - 1, total_pages)
|
||||||
|
with open(pdf_path, "rb") as f:
|
||||||
|
poller = self.doc_client.begin_analyze_document(
|
||||||
|
model_id="prebuilt-document",
|
||||||
|
document=f,
|
||||||
|
pages=f"{start_page}-{end_page}"
|
||||||
|
)
|
||||||
|
result = poller.result()
|
||||||
|
for page in result.pages:
|
||||||
|
page_lines = [line.content for line in (page.lines or [])]
|
||||||
|
pages_text.append('\n'.join(page_lines))
|
||||||
|
|
||||||
|
return pages_text
|
||||||
|
|
||||||
|
def get_page_count(self, pdf_path: Path) -> int:
|
||||||
|
"""Get the number of pages in a PDF"""
|
||||||
|
try:
|
||||||
|
with open(pdf_path, "rb") as f:
|
||||||
|
pdf_reader = PyPDF2.PdfReader(f)
|
||||||
|
return len(pdf_reader.pages)
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
return 0
|
||||||
|
|
||||||
|
def _extract_table_text(self, table) -> str:
|
||||||
|
"""Extract text from table with better structure preservation"""
|
||||||
|
if not hasattr(table, 'cells') or not table.cells:
|
||||||
|
return ""
|
||||||
|
|
||||||
|
max_row = max(cell.row_index for cell in table.cells)
|
||||||
|
max_col = max(cell.column_index for cell in table.cells)
|
||||||
|
table_array = [["" for _ in range(max_col + 1)] for _ in range(max_row + 1)]
|
||||||
|
|
||||||
|
for cell in table.cells:
|
||||||
|
if hasattr(cell, 'content'):
|
||||||
|
table_array[cell.row_index][cell.column_index] = cell.content
|
||||||
|
|
||||||
|
lines = []
|
||||||
|
for row_idx, row in enumerate(table_array):
|
||||||
|
row_text = " | ".join(str(cell).strip() for cell in row)
|
||||||
|
if row_text.strip():
|
||||||
|
lines.append(row_text)
|
||||||
|
# Add separator after header row
|
||||||
|
if row_idx == 0 and len(lines) == 1:
|
||||||
|
lines.append("-" * len(row_text))
|
||||||
|
|
||||||
|
return "\n".join(lines)
|
||||||
|
|
||||||
|
def extract_layers_from_page(self, page_text: str, page_num: int, system_prompt: str, user_prompt_template: str) -> List[Dict]:
|
||||||
|
"""Extract layers from a single page using GPT"""
|
||||||
|
if not page_text.strip():
|
||||||
|
logger.warning(f"⚠️ Page {page_num} is empty")
|
||||||
|
return []
|
||||||
|
|
||||||
|
max_chars = 8000 # todo
|
||||||
|
user_prompt = user_prompt_template.format(
|
||||||
|
page_num=page_num,
|
||||||
|
text=page_text[:max_chars]
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
logger.info(f"🤖 Processing page {page_num} with GPT-4...")
|
||||||
|
|
||||||
|
response = self.openai_client.chat.completions.create(
|
||||||
|
model=AZURE_DEPLOYMENT,
|
||||||
|
messages=[
|
||||||
|
{"role": "system", "content": system_prompt},
|
||||||
|
{"role": "user", "content": user_prompt}
|
||||||
|
],
|
||||||
|
temperature=0.1,
|
||||||
|
max_tokens=4000, # todo
|
||||||
|
response_format={"type": "json_object"}
|
||||||
|
)
|
||||||
|
|
||||||
|
result = json.loads(response.choices[0].message.content)
|
||||||
|
valid_layers = []
|
||||||
|
|
||||||
|
for layer in result.get("layers", []):
|
||||||
|
try:
|
||||||
|
depth_start = float(str(layer.get("depth_start", 0)).replace(",", "."))
|
||||||
|
depth_end = float(str(layer.get("depth_end", 0)).replace(",", "."))
|
||||||
|
description = layer.get("description", "").strip()
|
||||||
|
|
||||||
|
if (depth_start < depth_end and depth_end <= 100 and depth_start >= 0 and description):
|
||||||
|
valid_layers.append({
|
||||||
|
"page": page_num,
|
||||||
|
"depth_start": depth_start,
|
||||||
|
"depth_end": depth_end,
|
||||||
|
"description": description
|
||||||
|
})
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f" Invalid layer data: {e}")
|
||||||
|
continue
|
||||||
|
logger.info(f" ✓ Found {len(valid_layers)} valid layers on page {page_num}")
|
||||||
|
return valid_layers
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"❌ GPT extraction failed for page {page_num}: {e}")
|
||||||
|
return []
|
||||||
|
|
||||||
|
def merge_duplicate_layers(self, layers: List[Dict]) -> List[Dict]:
|
||||||
|
"""Merge duplicate layers across pages"""
|
||||||
|
if not layers:
|
||||||
|
return []
|
||||||
|
|
||||||
|
layers.sort(key=lambda x: (x['depth_start'], x['page']))
|
||||||
|
|
||||||
|
merged = []
|
||||||
|
current = None
|
||||||
|
|
||||||
|
for layer in layers:
|
||||||
|
if current is None:
|
||||||
|
current = layer.copy()
|
||||||
|
else:
|
||||||
|
if (abs(current['depth_start'] - layer['depth_start']) < 0.01 and
|
||||||
|
abs(current['depth_end'] - layer['depth_end']) < 0.01):
|
||||||
|
if len(layer['description']) > len(current['description']):
|
||||||
|
current['description'] = layer['description']
|
||||||
|
current['page'] = min(current['page'], layer['page'])
|
||||||
|
else:
|
||||||
|
merged.append(current)
|
||||||
|
current = layer.copy()
|
||||||
|
|
||||||
|
if current:
|
||||||
|
merged.append(current)
|
||||||
|
|
||||||
|
return merged
|
||||||
|
|
||||||
|
def extract_layers_from_pdf(self, pdf_path: Path) -> Dict:
|
||||||
|
"""Main extraction function"""
|
||||||
|
logger.info(f"\n🚀 Starting extraction for: {pdf_path.name}")
|
||||||
|
start_time = datetime.now()
|
||||||
|
|
||||||
|
pages_text = self.extract_text_from_pdf(pdf_path)
|
||||||
|
|
||||||
|
if not pages_text:
|
||||||
|
logger.error("❌ No text extracted from PDF")
|
||||||
|
return {
|
||||||
|
"metadata": {
|
||||||
|
"source_file": str(pdf_path.resolve()),
|
||||||
|
"extraction_date": datetime.now().isoformat(),
|
||||||
|
"extraction_method": "Azure Document Intelligence + GPT-4",
|
||||||
|
"pages": 0,
|
||||||
|
"layer_count": 0,
|
||||||
|
"error": "No text extracted from PDF"
|
||||||
|
},
|
||||||
|
"layers": []
|
||||||
|
}
|
||||||
|
|
||||||
|
system_prompt, user_prompt_template = self.get_prompts()
|
||||||
|
|
||||||
|
all_layers = []
|
||||||
|
for page_idx, page_text in enumerate(pages_text):
|
||||||
|
page_layers = self.extract_layers_from_page(
|
||||||
|
page_text, page_idx + 1, system_prompt, user_prompt_template
|
||||||
|
)
|
||||||
|
all_layers.extend(page_layers)
|
||||||
|
|
||||||
|
logger.info(f"\n🔄 Merging duplicate layers...")
|
||||||
|
merged_layers = self.merge_duplicate_layers(all_layers)
|
||||||
|
|
||||||
|
extraction_time = (datetime.now() - start_time).total_seconds()
|
||||||
|
|
||||||
|
logger.info(f"\n✅ Extraction completed!")
|
||||||
|
logger.info(f" Total layers found: {len(merged_layers)}")
|
||||||
|
logger.info(f" Extraction time: {extraction_time:.1f} seconds")
|
||||||
|
|
||||||
|
return {
|
||||||
|
"metadata": {
|
||||||
|
"source_file": str(pdf_path.resolve()),
|
||||||
|
"extraction_date": datetime.now().isoformat(),
|
||||||
|
"extraction_method": "Azure Document Intelligence + GPT-4",
|
||||||
|
"pages": len(pages_text),
|
||||||
|
"layer_count": len(merged_layers),
|
||||||
|
"extraction_time_seconds": round(extraction_time, 1)
|
||||||
|
},
|
||||||
|
"layers": merged_layers
|
||||||
|
}
|
||||||
|
|
||||||
|
@click.command()
|
||||||
|
@click.argument('pdf_path', type=click.Path(exists=True, path_type=Path))
|
||||||
|
@click.option('--output', '-o', type=click.Path(path_type=Path), help='Output JSON file path (default: same name as PDF)')
|
||||||
|
@click.option('--pretty', is_flag=True, help='Pretty print the JSON output')
|
||||||
|
def main(pdf_path: Path, output: Path, pretty: bool):
|
||||||
|
"""Extract geological layers from a single PDF file."""
|
||||||
|
click.echo("🪨 Geological Layer Extractor")
|
||||||
|
click.echo("=" * 40)
|
||||||
|
|
||||||
|
extractor = SinglePDFExtractor()
|
||||||
|
result = extractor.extract_layers_from_pdf(pdf_path)
|
||||||
|
|
||||||
|
if output is None:
|
||||||
|
output_dir = Path('output')
|
||||||
|
output_dir.mkdir(exist_ok=True)
|
||||||
|
output = output_dir / pdf_path.with_suffix('.json').name
|
||||||
|
with open(output, 'w', encoding='utf-8') as f:
|
||||||
|
if pretty:
|
||||||
|
json.dump(result, f, indent=2, ensure_ascii=False)
|
||||||
|
else:
|
||||||
|
json.dump(result, f, ensure_ascii=False)
|
||||||
|
|
||||||
|
click.echo(f"\n💾 Results saved to: {output}")
|
||||||
|
click.echo("\n📊 Extraction Summary:")
|
||||||
|
click.echo(f" Source: {pdf_path.name}")
|
||||||
|
click.echo(f" Pages processed: {result['metadata']['pages']}")
|
||||||
|
click.echo(f" Layers extracted: {result['metadata']['layer_count']}")
|
||||||
|
click.echo(f" Time taken: {result['metadata']['extraction_time_seconds']}s")
|
||||||
|
|
||||||
|
if result['layers']:
|
||||||
|
click.echo("\n📋 Preview of extracted layers:")
|
||||||
|
for i, layer in enumerate(result['layers'][:3], 1):
|
||||||
|
click.echo(f"\n {i}. {layer['depth_start']:.1f} - {layer['depth_end']:.1f}m")
|
||||||
|
click.echo(f" {layer['description'][:80]}{'...' if len(layer['description']) > 80 else ''}")
|
||||||
|
if len(result['layers']) > 3:
|
||||||
|
click.echo(f"\n ... and {len(result['layers']) - 3} more layers")
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
||||||
72
output_man/1_layers.json
Normal file
72
output_man/1_layers.json
Normal file
@@ -0,0 +1,72 @@
|
|||||||
|
{
|
||||||
|
"metadata": {
|
||||||
|
"source_file": "/Users/tom/Desktop/project/vinci/geotech/geotech-azure/examples/sample_pdfs/1.pdf",
|
||||||
|
"extraction_date": "2025-07-31T13:06:45.311638",
|
||||||
|
"extraction_method": "Azure Document Intelligence + GPT-3.5",
|
||||||
|
"pages": 2,
|
||||||
|
"layer_count": 8,
|
||||||
|
"extraction_time_seconds": 14.9
|
||||||
|
},
|
||||||
|
"layers": [
|
||||||
|
{
|
||||||
|
"page": 1,
|
||||||
|
"depth_start": 0.0,
|
||||||
|
"depth_end": 0.2,
|
||||||
|
"description": "Quartär Schluff, sehr schwach tonig, schwach feinsandig, schwach organisch, braun, steif"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"page": 1,
|
||||||
|
"depth_start": 0.2,
|
||||||
|
"depth_end": 0.8,
|
||||||
|
"description": "Quartär Schluff, schwach feinsandig, schwach tonig, hellbraun, dunkelbraun, steif"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"page": 1,
|
||||||
|
"depth_start": 0.8,
|
||||||
|
"depth_end": 3.2,
|
||||||
|
"description": "Quartär Sand, stark kiesig, schwach steinig, schwach schluffig, grau"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"page": 1,
|
||||||
|
"depth_start": 3.2,
|
||||||
|
"depth_end": 6.4,
|
||||||
|
"description": "Sand, schluffig, schwach feinkiesig, hellbraun, ockerbraun, graubraun, Metamagmatite"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"page": 1,
|
||||||
|
"depth_start": 6.4,
|
||||||
|
"depth_end": 10.0,
|
||||||
|
"description": "Plutonit, hellgrau, rot, Metamagmatite, Granit-Zersatz,"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"page": 2,
|
||||||
|
"depth_start": 0.0,
|
||||||
|
"depth_end": 0.2,
|
||||||
|
"description": "Quartär Schluff, sehr schwach tonig, schwach feinsandig, schwach organisch, braun, steif"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"page": 2,
|
||||||
|
"depth_start": 0.2,
|
||||||
|
"depth_end": 0.8,
|
||||||
|
"description": "Quartär Schluff, schwach feinsandig, schwach tonig, hellbraun, dunkelbraun, steif"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"page": 2,
|
||||||
|
"depth_start": 0.8,
|
||||||
|
"depth_end": 3.2,
|
||||||
|
"description": "Quartär Sand, stark kiesig, schwach steinig, schwach schluffig, grau"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"page": 2,
|
||||||
|
"depth_start": 3.2,
|
||||||
|
"depth_end": 6.4,
|
||||||
|
"description": "Sand, schluffig, schwach feinkiesig, hellbraun, ockerbraun, graubraun, Metamagmatite"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"page": 2,
|
||||||
|
"depth_start": 6.4,
|
||||||
|
"depth_end": 10.0,
|
||||||
|
"description": "Plutonit, hellgrau, rot, Metamagmatite, Granit-Zersatz,"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
30
output_man/2_layers.json
Normal file
30
output_man/2_layers.json
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
{
|
||||||
|
"metadata": {
|
||||||
|
"source_file": "/Users/tom/Desktop/project/vinci/geotech/geotech-azure/examples/sample_pdfs/2.pdf",
|
||||||
|
"extraction_date": "2025-07-31T13:03:32.291628",
|
||||||
|
"extraction_method": "Azure Document Intelligence + GPT-3.5",
|
||||||
|
"pages": 1,
|
||||||
|
"layer_count": 3,
|
||||||
|
"extraction_time_seconds": 8.67
|
||||||
|
},
|
||||||
|
"layers": [
|
||||||
|
{
|
||||||
|
"page": 1,
|
||||||
|
"depth_start": 0.0,
|
||||||
|
"depth_end": 0.3,
|
||||||
|
"description": "Mutterboden, Sand, schluffig, schwach tonig, schwach kiesig, schwach humos, erdfeucht, dunkelbraun, kalkfrei, g = kantig"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"page": 1,
|
||||||
|
"depth_start": 0.3,
|
||||||
|
"depth_end": 4.0,
|
||||||
|
"description": "Sand, schluffig bis stark schluffig, schwach kiesig bis kiesig, halbfest, erdfeucht bis feucht, braun, graubraun, hellbraun, kalkfrei, vollständig verwittert bis zersetzt, Gneis-Zersatz, mürbe, g = kantig"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"page": 1,
|
||||||
|
"depth_start": 4.0,
|
||||||
|
"depth_end": 10.0,
|
||||||
|
"description": "massige Metamorphite, erdfeucht, graubraun, rotfahl, kalkfrei, mäßig verwittert bis stark verwittert, R3-R4, Gneis, geklüftet bis stak geklüftet, z.T. zerbohrt"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
30
output_man/3_layers.json
Normal file
30
output_man/3_layers.json
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
{
|
||||||
|
"metadata": {
|
||||||
|
"source_file": "/Users/tom/Desktop/project/vinci/geotech/geotech-azure/examples/sample_pdfs/3.pdf",
|
||||||
|
"extraction_date": "2025-07-31T13:06:23.295594",
|
||||||
|
"extraction_method": "Azure Document Intelligence + GPT-3.5",
|
||||||
|
"pages": 2,
|
||||||
|
"layer_count": 3,
|
||||||
|
"extraction_time_seconds": 10.99
|
||||||
|
},
|
||||||
|
"layers": [
|
||||||
|
{
|
||||||
|
"page": 1,
|
||||||
|
"depth_start": 0.0,
|
||||||
|
"depth_end": 0.2,
|
||||||
|
"description": "Terre végétale"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"page": 1,
|
||||||
|
"depth_start": 0.2,
|
||||||
|
"depth_end": 1.4,
|
||||||
|
"description": "Limon marron ocre orange"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"page": 1,
|
||||||
|
"depth_start": 1.4,
|
||||||
|
"depth_end": 3.0,
|
||||||
|
"description": "Argile marron avec passage grisâtre"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
72
output_man/4_layers.json
Normal file
72
output_man/4_layers.json
Normal file
@@ -0,0 +1,72 @@
|
|||||||
|
{
|
||||||
|
"metadata": {
|
||||||
|
"source_file": "/Users/tom/Desktop/project/vinci/geotech/geotech-azure/examples/sample_pdfs/2.pdf",
|
||||||
|
"extraction_date": "2025-07-31T13:03:32.291628",
|
||||||
|
"extraction_method": "Azure Document Intelligence + GPT-3.5",
|
||||||
|
"pages": 1,
|
||||||
|
"layer_count": 3,
|
||||||
|
"extraction_time_seconds": 8.67
|
||||||
|
},
|
||||||
|
"layers": [
|
||||||
|
{
|
||||||
|
"page": 1,
|
||||||
|
"depth_start": 0.0,
|
||||||
|
"depth_end": 2.8,
|
||||||
|
"description": "Top soil: Soft, beige to grey, silty CLAY to clayey SILT. Containing root of plants and fragments of shells."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"page": 1,
|
||||||
|
"depth_start": 2.8,
|
||||||
|
"depth_end": 10.0,
|
||||||
|
"description": "Loose to medium dense, green grey, fine SAND. Containing gravel, fragments of shells a little organic matter."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"page": 2,
|
||||||
|
"depth_start": 10.0,
|
||||||
|
"depth_end": 20.0,
|
||||||
|
"description": "Soft, green grey, clayey SILT to silty CLAY. Containing fragments of shells and organic matter."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"page": 3,
|
||||||
|
"depth_start": 20.0,
|
||||||
|
"depth_end": 24.0,
|
||||||
|
"description": "Soft, green grey, silty CLAY. Containing fragments of shells and organic matter."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"page": 3,
|
||||||
|
"depth_start": 20.0,
|
||||||
|
"depth_end": 28.6,
|
||||||
|
"description": "Soft to firm, green grey, silty CLAY. Containing fragments of shells and organic matter."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"page": 3,
|
||||||
|
"depth_start": 28.6,
|
||||||
|
"depth_end": 34.0,
|
||||||
|
"description": "Loose to medium dense, green grey, fine SAND. Containing fragments of shells and a little organic matter."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"page": 4,
|
||||||
|
"depth_start": 34.0,
|
||||||
|
"depth_end": 47.3,
|
||||||
|
"description": "Soft to firm, green grey, silty CLAY. Containg bed of silty sand, fragments of shells and a little organic matter."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"page": 5,
|
||||||
|
"depth_start": 47.3,
|
||||||
|
"depth_end": 51.5,
|
||||||
|
"description": "Frim to stiff, green grey, silty CLAY. Containing gravel and fragments of shells."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"page": 6,
|
||||||
|
"depth_start": 51.5,
|
||||||
|
"depth_end": 57.5,
|
||||||
|
"description": "Medium dense to dense, green grey, fine SAND. Containing fragments of shells and a little organic matter."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"page": 6,
|
||||||
|
"depth_start": 57.5,
|
||||||
|
"depth_end": 70.0,
|
||||||
|
"description": "Firm, green grey, silty CLAY. Containing beds of silty sand, fragments of shells and a little organic matter."
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
36
output_man/5_layers.json
Normal file
36
output_man/5_layers.json
Normal file
@@ -0,0 +1,36 @@
|
|||||||
|
{
|
||||||
|
"metadata": {
|
||||||
|
"source_file": "/Users/tom/Desktop/project/vinci/geotech/geotech-azure/examples/sample_pdfs/5.pdf",
|
||||||
|
"extraction_date": "2025-07-30T18:48:56.422438",
|
||||||
|
"extraction_method": "Azure Document Intelligence + GPT-3.5",
|
||||||
|
"pages": 2,
|
||||||
|
"layer_count": 9,
|
||||||
|
"extraction_time_seconds": 16.26
|
||||||
|
},
|
||||||
|
"layers": [
|
||||||
|
{
|
||||||
|
"page": 1,
|
||||||
|
"depth_start": 0.0,
|
||||||
|
"depth_end": 0.1,
|
||||||
|
"description": "Terre végétale"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"page": 1,
|
||||||
|
"depth_start": 0.1,
|
||||||
|
"depth_end": 0.6,
|
||||||
|
"description": "Limon + argileux brun avec quelques silex et + de matières organiques."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"page": 1,
|
||||||
|
"depth_start": 0.6,
|
||||||
|
"depth_end": 2.0,
|
||||||
|
"description": "Argile à silex Très charpentée, marron. Proportion de silex : ~ 80%."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"page": 1,
|
||||||
|
"depth_start": 2.0,
|
||||||
|
"depth_end": 3.0,
|
||||||
|
"description": "Craie + argileuse Très altérée, blanchâtre avec silex. Proportion de silex : > 50%."
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
132
output_man/6_layers.json
Normal file
132
output_man/6_layers.json
Normal file
@@ -0,0 +1,132 @@
|
|||||||
|
{
|
||||||
|
"metadata": {
|
||||||
|
"source_file": "/Users/tom/Desktop/project/vinci/geotech/geotech-azure/examples/sample_pdfs/5.pdf",
|
||||||
|
"extraction_date": "2025-07-30T18:48:56.422438",
|
||||||
|
"extraction_method": "Azure Document Intelligence + GPT-3.5",
|
||||||
|
"pages": 2,
|
||||||
|
"layer_count": 9,
|
||||||
|
"extraction_time_seconds": 16.26
|
||||||
|
},
|
||||||
|
"layers": [
|
||||||
|
{
|
||||||
|
"page": 1,
|
||||||
|
"depth_start": 0.0,
|
||||||
|
"depth_end": 0.3,
|
||||||
|
"description": "Silty CLAY with some rootlets; brown. Firm, moist, high plasticity (TOPSOIL)."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"page": 1,
|
||||||
|
"depth_start": 0.3,
|
||||||
|
"depth_end": 1.5,
|
||||||
|
"description": "Silty CLAY; light brown mottled orange. Firm, moist, high plasticity."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"page": 1,
|
||||||
|
"depth_start": 1.5,
|
||||||
|
"depth_end": 2.9,
|
||||||
|
"description": "Sandy SILT with some carbonaceous intrusions; light grey streaked orange. Stiff to very stiff, moist, low plasticity; sand, fine to coarse (Completely degraded NORTHLAND ALLOCHTHON GRADE 5)."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"page": 1,
|
||||||
|
"depth_start": 2.9,
|
||||||
|
"depth_end": 3.0,
|
||||||
|
"description": "Highly degraded, light grey streaked orange, massive LIMESTONE; extremely weak, recovered as Sandy SILT; Very stiff, moist, low plasticity (NORTHLAND ALLOCHTHON GRADE 4)."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"page": 1,
|
||||||
|
"depth_start": 3.0,
|
||||||
|
"depth_end": 5.5,
|
||||||
|
"description": "Moderately degraded, light bluish grey, massive LIMESTONE; extremely weak to very weak. Highly micro-fractured, sheared and slickensided. (NORTHLAND ALLOCHTHON GRADE 3)."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"page": 1,
|
||||||
|
"depth_start": 5.5,
|
||||||
|
"depth_end": 5.8,
|
||||||
|
"description": "Highly degraded, light bluish grey, massive LIMESTONE; extremely weak, recovered as Sandy SILT; Very stiff, moist, low plasticity (NORTHLAND ALLOCHTHON GRADE 4)."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"page": 1,
|
||||||
|
"depth_start": 5.8,
|
||||||
|
"depth_end": 6.39,
|
||||||
|
"description": "Moderately degraded, light bluish grey, massive LIMESTONE; extremely weak to very weak. Highly micro-fractured, sheared and slickensided. (NORTHLAND ALLOCHTHON GRADE 3)."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"page": 1,
|
||||||
|
"depth_start": 6.39,
|
||||||
|
"depth_end": 7.05,
|
||||||
|
"description": "Highly degraded, light bluish grey, massive LIMESTONE; extremely weak, recovered as Silty fine to coarse SAND with some minor fine gravel; Medium Dense to Dense, wet, poorly graded (NORTHLAND ALLOCHTHON GRADE 4)."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"page": 1,
|
||||||
|
"depth_start": 7.05,
|
||||||
|
"depth_end": 7.8,
|
||||||
|
"description": "Highly degraded, light grey, massive LIMESTONE; extremely weak, recovered as Sandy SILT; Very stiff, moist, low plasticity (NORTHLAND ALLOCHTHON GRADE 4)."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"page": 1,
|
||||||
|
"depth_start": 7.8,
|
||||||
|
"depth_end": 9.7,
|
||||||
|
"description": "Moderately degraded, grey, thinly bedded MUDSTONE; extremely weak to very weak; bedding is moderately inclined. Highly micro-fractured, sheared and slickensided. (NORTHLAND ALLOCHTHON GRADE 3)."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"page": 2,
|
||||||
|
"depth_start": 9.7,
|
||||||
|
"depth_end": 11.7,
|
||||||
|
"description": "Slightly degraded, bluish grey, massive LIMESTONE; extremely weak to very weak. Highly micro-fractured, sheared and slickensided. (NORTHLAND ALLOCHTHON GRADE 2)."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"page": 2,
|
||||||
|
"depth_start": 11.7,
|
||||||
|
"depth_end": 15.8,
|
||||||
|
"description": "Slightly weathered, grey, moderately thickly bedded, fine SANDSTONE; weak with thinly interbedded dark grey SILTSTONE; very weak. Bedding is steeply inclined."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"page": 2,
|
||||||
|
"depth_start": 15.8,
|
||||||
|
"depth_end": 17.4,
|
||||||
|
"description": "Slightly degraded, bluish grey, massive LIMESTONE; extremely weak. Highly"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"page": 3,
|
||||||
|
"depth_start": 17.4,
|
||||||
|
"depth_end": 18.0,
|
||||||
|
"description": "Slightly degraded, dark grey, massive MUDSTONE; weak with some fine to coarse gravel sized, subrounded limestone clasts. Highly microfractured and slickensided (NORTHLAND ALLOCHTHON GRADE 2)."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"page": 3,
|
||||||
|
"depth_start": 18.0,
|
||||||
|
"depth_end": 21.3,
|
||||||
|
"description": "Slightly degraded, interbedded, dark grey MUDSTONE and bluish grey LIMESTONE; extremely weak. Highly microfractured and slickensided (NORTHLAND ALLOCHTHON GRADE 2)."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"page": 3,
|
||||||
|
"depth_start": 21.3,
|
||||||
|
"depth_end": 22.75,
|
||||||
|
"description": "Slighty degraded, light grey MUDSTONE; extremely weak to very weak. Highly micro-fractured, sheared and slickensided (NORTHLAND ALLOCHTHON GRADE 2)."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"page": 3,
|
||||||
|
"depth_start": 22.75,
|
||||||
|
"depth_end": 23.4,
|
||||||
|
"description": "Slightly degraded, bluish grey, massive LIMESTONE; extremely weak. Highly microfractured and slickensided (NORTHLAND ALLOCHTHON GRADE 2)."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"page": 3,
|
||||||
|
"depth_start": 23.4,
|
||||||
|
"depth_end": 23.7,
|
||||||
|
"description": "Slighty degraded, light grey MUDSTONE; extremely weak to very weak. Highly micro-fractured, sheared and slickensided (NORTHLAND ALLOCHTHON GRADE 2)."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"page": 3,
|
||||||
|
"depth_start": 23.7,
|
||||||
|
"depth_end": 24.45,
|
||||||
|
"description": "Slightly degraded, bluish grey, massive LIMESTONE; extremely weak. Highly microfractured and slickensided (NORTHLAND ALLOCHTHON GRADE 2)."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"page": 4,
|
||||||
|
"depth_start": 24.45,
|
||||||
|
"depth_end": 30.0,
|
||||||
|
"description": "Slighty degraded, light grey MUDSTONE; extremely weak to very weak. Highly micro-fractured, sheared and slickensided (NORTHLAND ALLOCHTHON GRADE 2)."
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
475
prompt_optimizer.py
Normal file
475
prompt_optimizer.py
Normal file
@@ -0,0 +1,475 @@
|
|||||||
|
#!/usr/bin/env python3
"""
Simple Prompt Tester - Test current prompt without optimization
"""
import click
import json
import logging
import PyPDF2
import os

from dotenv import load_dotenv
from pathlib import Path
from typing import List, Dict, Tuple
from datetime import datetime
from difflib import SequenceMatcher
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential
from openai import AzureOpenAI

load_dotenv()

# Azure Document Intelligence configuration
AZURE_DOC_ENDPOINT = os.getenv("AZURE_DOC_ENDPOINT")
AZURE_DOC_KEY = os.getenv("AZURE_DOC_KEY")

# Azure OpenAI configuration
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_KEY = os.getenv("AZURE_OPENAI_KEY")
AZURE_DEPLOYMENT = os.getenv("AZURE_DEPLOYMENT", "gpt-4.1")
API_VERSION = os.getenv("API_VERSION", "2024-12-01-preview")

if not all([AZURE_DOC_ENDPOINT, AZURE_DOC_KEY, AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_KEY]):
    raise ValueError(
        "❌ Missing Azure credentials! Please check your .env file.\n"
        "Required variables: AZURE_DOC_ENDPOINT, AZURE_DOC_KEY, AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_KEY"
    )

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

logger.info("✅ Environment variables loaded successfully")
logger.info(f"📍 Document Intelligence endpoint: {AZURE_DOC_ENDPOINT}")
logger.info(f"🤖 OpenAI deployment: {AZURE_DEPLOYMENT}")


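# Wraps the Azure Document Intelligence (OCR) and Azure OpenAI clients
# and runs the current extraction prompt against test PDFs.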
class PromptTester:
    """Test a single prompt on PDFs"""

    def __init__(self):
        # Document Intelligence client used for OCR / layout analysis
        self.doc_client = DocumentAnalysisClient(
            endpoint=AZURE_DOC_ENDPOINT,
            credential=AzureKeyCredential(AZURE_DOC_KEY)
        )
        # Azure OpenAI client used to extract layers from the OCR text
        self.openai_client = AzureOpenAI(
            azure_endpoint=AZURE_OPENAI_ENDPOINT,
            api_key=AZURE_OPENAI_KEY,
            api_version=API_VERSION
        )
        # Cache for OCR results
        self.ocr_cache = {}

    def get_current_prompt(self) -> Tuple[str, str]:
        """Get the current working prompt"""
        system_prompt = """Tu es un expert géotechnicien spécialisé dans l'analyse de logs de forage.
Ta tâche est d'extraire UNIQUEMENT les couches géologiques avec leurs profondeurs et descriptions COMPLÈTES.

RÈGLES CRITIQUES:
1. Chaque couche DOIT avoir une profondeur de début et de fin en mètres
2. La description doit contenir TOUT le texte associé à la couche (même sur plusieurs lignes)
3. Notation décimale: 0,3 ou 0.3 = 0.3m (PAS 3m!)
4. Profondeurs typiques: 0-0.3m, 0.3-1m, 1-4m, 4-10m, etc.
5. NE PAS séparer ou classifier le matériau - tout dans la description
6. IGNORER les mots isolés qui ne sont pas dans la colonne description

RÈGLE DE CONTINUITÉ DES COUCHES:
⚠️ Une couche géologique est CONTINUE entre deux profondeurs
⚠️ Si tu vois une profondeur isolée suivie de description, cherche OBLIGATOIREMENT la profondeur de fin
⚠️ Les profondeurs de fin peuvent être:
- Sur la même ligne après la description
- Sur une ligne suivante
- Au début de la couche suivante (la fin d'une couche = début de la suivante)
⚠️ NE JAMAIS fragmenter une description continue en plusieurs couches

GESTION DES PAGES:
⚠️ Les couches peuvent continuer d'une page à l'autre
⚠️ Sur une nouvelle page, si la première profondeur n'est pas 0.0m, c'est la continuation de la page précédente
⚠️ Ne pas dupliquer les couches identiques sur plusieurs pages

SORTIE: JSON valide uniquement"""

user_prompt_template = """Extrait toutes les couches géologiques de cette page de log de forage.
|
||||||
|
|
||||||
|
CE QUI EST UNE COUCHE GÉOLOGIQUE:
|
||||||
|
✓ Descriptions de matériaux: sol, roche, sable, argile, limon, remblai, terre végétale, etc.
|
||||||
|
✓ A une plage de profondeur ET des propriétés de matériau
|
||||||
|
✓ Décrit les caractéristiques physiques: couleur, consistance, humidité, altération, granulométrie
|
||||||
|
|
||||||
|
CE QUI N'EST PAS UNE COUCHE (IGNORER):
|
||||||
|
✗ Descriptions d'images ou légendes
|
||||||
|
✗ Métadonnées du projet (coordonnées, dates, noms d'entreprise)
|
||||||
|
✗ Descriptions d'équipement ou de méthodologie
|
||||||
|
✗ Données administratives ou commentaires sur la procédure
|
||||||
|
✗ Mots isolés dans d'autres colonnes (classifications, formations géologiques, etc.)
|
||||||
|
|
||||||
|
EXCLUSIONS ABSOLUES - NE JAMAIS extraire:
|
||||||
|
✗ Annotations techniques isolées: CZ, JT, JTx5, etc.
|
||||||
|
✗ Mesures techniques sans description de matériau
|
||||||
|
✗ Échantillons ("Undisturbed sample", "Disturbed sample", "sample", "échantillon")
|
||||||
|
✗ Lignes contenant uniquement des angles, pressions ou mesures
|
||||||
|
✗ Références à des essais ou tests sans description de sol
|
||||||
|
|
||||||
|
ALGORITHME DE DÉTECTION DES PROFONDEURS:
|
||||||
|
1. Pour chaque nombre qui ressemble à une profondeur (X.X m ou X,X m):
|
||||||
|
- Si suivi de "-" ou "à" ou "bis" → profondeur de début, chercher la fin
|
||||||
|
- Si précédé de "-" ou "à" ou "bis" → profondeur de fin
|
||||||
|
- Si isolé → vérifier le contexte:
|
||||||
|
* Début de description → profondeur de début
|
||||||
|
* Fin de description → profondeur de fin
|
||||||
|
* Nouvelle ligne → début de nouvelle couche
|
||||||
|
2. TOUJOURS vérifier la cohérence: fin couche N = début couche N+1
|
||||||
|
3. Si une couche n'a pas de profondeur de fin explicite, elle va jusqu'au début de la couche suivante
|
||||||
|
|
||||||
|
MOTIFS À RECHERCHER:
|
||||||
|
- "0.00-0.30m: Terre végétale, brun foncé..."
|
||||||
|
- "0,3 à 1,0m Sable fin beige..."
|
||||||
|
- "de 1m à 4m: Argile plastique grise..."
|
||||||
|
- "1.00 - 4.00 ARGILE limoneuse..."
|
||||||
|
- "0.3 m Description... 4.0 m" (profondeurs séparées)
|
||||||
|
- Tableaux avec colonnes profondeur/description
|
||||||
|
|
||||||
|
IDENTIFICATION DES COLONNES DE TABLEAU:
|
||||||
|
⚠️ Dans un tableau, identifier VISUELLEMENT les colonnes avant d'extraire
|
||||||
|
⚠️ Colonnes à IGNORER complètement:
|
||||||
|
- Formation géologique (Quaternaire, Tertiaire, Quaternaire, etc.)
|
||||||
|
- Stratigraphie
|
||||||
|
- Classification (Colluvions, Alluvions, etc.)
|
||||||
|
- Symboles ou codes isolés
|
||||||
|
- Âge géologique
|
||||||
|
⚠️ Colonnes à UTILISER:
|
||||||
|
- Description du matériau
|
||||||
|
- Propriétés physiques (couleur, texture, consistance)
|
||||||
|
⚠️ Si doute sur une colonne, vérifier si elle contient des propriétés physiques détaillées
|
||||||
|
|
||||||
|
RÈGLE CRITIQUE DE SÉPARATION:
|
||||||
|
⚠️ Les profondeurs (0-0.3m, 0.30m, etc.) ne font JAMAIS partie de la description
|
||||||
|
⚠️ Si une ligne contient "0,30 m - Mutterboden, braun", alors:
|
||||||
|
- depth_start: 0.0, depth_end: 0.3
|
||||||
|
- description: "Mutterboden, braun" (SANS les profondeurs)
|
||||||
|
⚠️ Retirer TOUS les préfixes de profondeur du champ description
|
||||||
|
⚠️ NE PAS inclure les profondeurs répétées à la fin de la description
|
||||||
|
|
||||||
|
IMPORTANT:
|
||||||
|
- Capture TOUT le texte de description, même s'il est long ou sur plusieurs lignes
|
||||||
|
- Ne pas résumer ou reformuler - garder le texte original
|
||||||
|
- Inclure TOUTES les propriétés mentionnées (couleur, texture, humidité, inclusions, etc.)
|
||||||
|
- EXCLURE ABSOLUMENT les profondeurs du champ description
|
||||||
|
- Si le texte commence par "0,30 m - Mutterboden", la description est SEULEMENT "Mutterboden"
|
||||||
|
- Séparer clairement: profondeurs dans depth_start/depth_end, matériau et propriétés dans description
|
||||||
|
- NE PAS mélanger les colonnes d'un tableau - prendre uniquement la colonne description principale
|
||||||
|
|
||||||
|
VALIDATION OBLIGATOIRE avant de retourner le JSON:
|
||||||
|
□ Toutes les couches ont une profondeur de début ET de fin
|
||||||
|
□ Les profondeurs sont continues (pas de trous entre les couches)
|
||||||
|
□ Les profondeurs sont cohérentes (début < fin)
|
||||||
|
□ Les descriptions ne contiennent PAS les profondeurs
|
||||||
|
□ Les descriptions ne contiennent PAS de noms de colonnes de classification
|
||||||
|
□ Aucune annotation technique isolée n'est présente
|
||||||
|
□ Pas de duplication de couches identiques
|
||||||
|
|
||||||
|
Retourne ce JSON:
|
||||||
|
{{
|
||||||
|
"layers": [
|
||||||
|
{{
|
||||||
|
"depth_start": 0.0,
|
||||||
|
"depth_end": 0.3,
|
||||||
|
"description": "Texte de description SANS les profondeurs - uniquement matériau et propriétés"
|
||||||
|
}}
|
||||||
|
]
|
||||||
|
}}
|
||||||
|
|
||||||
|
EXEMPLE CONCRET:
|
||||||
|
Texte original: "0,00 - 0,30 m Mutterboden, dunkelbraun, humos"
|
||||||
|
Résultat attendu:
|
||||||
|
{{
|
||||||
|
"depth_start": 0.0,
|
||||||
|
"depth_end": 0.3,
|
||||||
|
"description": "Mutterboden, dunkelbraun, humos"
|
||||||
|
}}
|
||||||
|
PAS: "description": "0,00 - 0,30 m Mutterboden, dunkelbraun, humos" ❌
|
||||||
|
|
||||||
|
TEXTE DE LA PAGE {page_num}:
|
||||||
|
{text}"""
|
||||||
|
|
||||||
|
return system_prompt, user_prompt_template
|
||||||
|
|
||||||
|
    def extract_text_from_pdf(self, pdf_path: Path) -> List[str]:
        """Extract text from PDF with chunking"""
        logger.info(f"🔍 Extracting text from {pdf_path.name}")

        total_pages = self.get_page_count(pdf_path)
        logger.info(f"📄 PDF has {total_pages} pages")

        if total_pages == 0:
            logger.error("Could not determine page count")
            return []

        # Send the document to Document Intelligence in chunks of 2 pages
        pages_text = []
        chunk_size = 2
        for start_page in range(1, total_pages + 1, chunk_size):
            end_page = min(start_page + chunk_size - 1, total_pages)
            with open(pdf_path, "rb") as f:
                poller = self.doc_client.begin_analyze_document(
                    model_id="prebuilt-document",
                    document=f,
                    pages=f"{start_page}-{end_page}"
                )
                result = poller.result()
            for page in result.pages:
                page_lines = [line.content for line in (page.lines or [])]
                pages_text.append('\n'.join(page_lines))

        return pages_text

    def get_page_count(self, pdf_path: Path) -> int:
        """Get the number of pages in a PDF"""
        try:
            with open(pdf_path, "rb") as f:
                pdf_reader = PyPDF2.PdfReader(f)
                return len(pdf_reader.pages)
        except Exception:
            pass
        # 0 signals an unreadable PDF to the caller
        return 0

    def _extract_table_text(self, table) -> str:
        """Extract text from table"""
        if not hasattr(table, 'cells') or not table.cells:
            return ""

        # Rebuild the table as a dense 2D array from the flat cell list
        max_row = max(cell.row_index for cell in table.cells)
        max_col = max(cell.column_index for cell in table.cells)
        table_array = [["" for _ in range(max_col + 1)] for _ in range(max_row + 1)]

        for cell in table.cells:
            if hasattr(cell, 'content'):
                table_array[cell.row_index][cell.column_index] = cell.content

        lines = []
        for row in table_array:
            row_text = " | ".join(str(cell).strip() for cell in row)
            if row_text.strip():
                lines.append(row_text)
        return "\n".join(lines)

    def test_prompt_on_pdf(self, system_prompt: str, user_prompt_template: str, pdf_path: Path) -> List[Dict]:
        """Test prompt on a single PDF"""
        pages_text = self.extract_text_from_pdf(pdf_path)
        all_layers = []

        for page_idx, page_text in enumerate(pages_text):
            if not page_text.strip():
                continue

            user_prompt = user_prompt_template.format(
                page_num=page_idx + 1,
                text=page_text[:4000]
            )

            try:
                response = self.openai_client.chat.completions.create(
                    model=AZURE_DEPLOYMENT,
                    messages=[
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": user_prompt}
                    ],
                    temperature=0.1,
                    max_tokens=3000,
                    response_format={"type": "json_object"}
                )

                result = json.loads(response.choices[0].message.content)

                for layer in result.get("layers", []):
                    try:
                        # Accept both comma and dot decimal notation
                        depth_start = float(str(layer.get("depth_start", 0)).replace(",", "."))
                        depth_end = float(str(layer.get("depth_end", 0)).replace(",", "."))
                        description = layer.get("description", "").strip()

                        # Keep only plausible layers: ordered, non-negative depths no deeper than 100 m
                        if (depth_start < depth_end and depth_end <= 100 and
                                depth_start >= 0 and description):
                            all_layers.append({
                                "page": page_idx + 1,
                                "depth_start": depth_start,
                                "depth_end": depth_end,
                                "description": description
                            })
                    except (TypeError, ValueError):
                        continue

            except Exception as e:
                logger.error(f"GPT extraction failed for page {page_idx + 1}: {e}")

        return sorted(all_layers, key=lambda x: (x['page'], x['depth_start']))

    def compare_results(self, extracted: List[Dict], expected: List[Dict]) -> Dict:
        """Compare extracted layers with expected results"""
        exact_matches = 0
        depth_matches = 0

        for exp in expected:
            for ext in extracted:
                if (ext['page'] == exp['page'] and
                        abs(ext['depth_start'] - exp['depth_start']) < 0.01 and
                        abs(ext['depth_end'] - exp['depth_end']) < 0.01):
                    depth_matches += 1

                    desc_sim = SequenceMatcher(
                        None,
                        ext['description'].lower(),
                        exp['description'].lower()
                    ).ratio()

                    if desc_sim == 1:
                        exact_matches += 1
                    break

        # Similarity: extracted layers that match an expected layer (same page,
        # depths within 0.5 m), as a fraction of the expected layer count
        if len(expected) > 0:
            similarity = len([e for e in extracted if any(
                abs(e['depth_start'] - x['depth_start']) < 0.5 and
                abs(e['depth_end'] - x['depth_end']) < 0.5 and
                e['page'] == x['page'] for x in expected
            )]) / len(expected)
        else:
            similarity = 1.0 if len(extracted) == 0 else 0.0

        return {
            'similarity': similarity,
            'exact_matches': exact_matches,
            'depth_matches': depth_matches,
            'extracted_count': len(extracted),
            'expected_count': len(expected)
        }

    def test_all_pdfs(self, examples_dir: Path, expected_dir: Path):
        """Test prompt on all PDFs"""
        # Pair each expected *_layers.json file with its source PDF
        test_pairs = []
        for json_file in expected_dir.glob("*_layers.json"):
            pdf_name = json_file.stem.replace("_layers", "") + ".pdf"
            pdf_path = examples_dir / pdf_name
            if pdf_path.exists():
                test_pairs.append((pdf_path, json_file))

        if not test_pairs:
            click.echo("❌ No valid PDF/JSON pairs found!")
            return

        click.echo(f"\n📚 Found {len(test_pairs)} test PDFs")

        system_prompt, user_prompt_template = self.get_current_prompt()

        results = {}
        total_similarity = 0

        with click.progressbar(test_pairs, label='Testing PDFs') as pairs:
            for pdf_path, json_path in pairs:
                try:
                    with open(json_path, 'r', encoding='utf-8') as f:
                        data = json.load(f)

                    # Expected files may be either {"layers": [...]} or a bare list of layers
                    if isinstance(data, dict) and 'layers' in data:
                        expected_layers = data['layers']
                    elif isinstance(data, list):
                        expected_layers = data
                    else:
                        logger.warning(f"Unknown format in {json_path}")
                        continue

                    extracted_layers = self.test_prompt_on_pdf(
                        system_prompt, user_prompt_template, pdf_path
                    )

                    comparison = self.compare_results(extracted_layers, expected_layers)
                    results[pdf_path.name] = comparison
                    total_similarity += comparison['similarity']

                    single_result = {
                        "metadata": {
                            "source_file": str(pdf_path.resolve()),
                            "expected_file": str(json_path.resolve()),
                            "extraction_time": datetime.now().isoformat(),
                            "extracted_count": comparison['extracted_count'],
                            "expected_count": comparison['expected_count']
                        },
                        "layers": extracted_layers
                    }

                    output_dir = Path("output")
                    output_dir.mkdir(parents=True, exist_ok=True)
                    output_file_path = output_dir / f"{pdf_path.stem}_test_result.json"

                    with open(output_file_path, 'w', encoding='utf-8') as out_f:
                        json.dump(single_result, out_f, indent=2, ensure_ascii=False)

                    click.echo(f"💾 Result saved: {output_file_path}")

                except Exception as e:
                    logger.error(f"Error with {pdf_path.name}: {e}")
                    results[pdf_path.name] = {
                        'similarity': 0,
                        'error': str(e)
                    }

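        # Print a per-PDF summary and the overall average similarity, then save detailed results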
click.echo("\n" + "="*60)
|
||||||
|
click.echo("📊 TEST RESULTS")
|
||||||
|
click.echo("="*60)
|
||||||
|
|
||||||
|
for pdf_name, result in results.items():
|
||||||
|
if 'error' in result:
|
||||||
|
click.echo(f"\n❌ {pdf_name}: ERROR - {result['error']}")
|
||||||
|
else:
|
||||||
|
click.echo(f"\n📄 {pdf_name}:")
|
||||||
|
click.echo(f" Similarity: {result['similarity']:.1%}")
|
||||||
|
click.echo(f" Extracted: {result['extracted_count']} layers")
|
||||||
|
click.echo(f" Expected: {result['expected_count']} layers")
|
||||||
|
click.echo(f" Exact matches: {result['exact_matches']}")
|
||||||
|
click.echo(f" Depth matches: {result['depth_matches']}")
|
||||||
|
|
||||||
|
avg_similarity = total_similarity / len(test_pairs) if test_pairs else 0
|
||||||
|
click.echo("\n" + "="*60)
|
||||||
|
click.echo(f"📈 OVERALL: {avg_similarity:.1%} average similarity")
|
||||||
|
click.echo("="*60)
|
||||||
|
|
||||||
|
output_file = Path("output/test_results.json")
|
||||||
|
with open(output_file, 'w', encoding='utf-8') as f:
|
||||||
|
json.dump({
|
||||||
|
'timestamp': datetime.now().isoformat(),
|
||||||
|
'average_similarity': avg_similarity,
|
||||||
|
'results': results
|
||||||
|
}, f, indent=2)
|
||||||
|
|
||||||
|
click.echo(f"\n💾 Detailed results saved to: {output_file}")
|
||||||
|
|
||||||
|
@click.command()
@click.option('--examples-dir', '-e', default='examples', help='Directory containing test PDFs')
@click.option('--expected-dir', '-x', default='output_man', help='Directory containing expected results')
@click.option('--pdf', '-p', help='Test a single PDF only')
def main(examples_dir, expected_dir, pdf):
    """Test current prompt on PDFs without optimization"""

    click.echo("🧪 Simple Prompt Tester")
    click.echo("=" * 30)

    tester = PromptTester()

    if pdf:
        pdf_path = Path(pdf)
        if not pdf_path.exists():
            raise click.ClickException(f"PDF not found: {pdf}")

        click.echo(f"\n📄 Testing single PDF: {pdf_path.name}")

        system_prompt, user_prompt_template = tester.get_current_prompt()
        layers = tester.test_prompt_on_pdf(system_prompt, user_prompt_template, pdf_path)

        click.echo(f"\n✅ Extracted {len(layers)} layers:")
        for i, layer in enumerate(layers, 1):
            click.echo(f"\n{i}. Depth: {layer['depth_start']:.2f} - {layer['depth_end']:.2f}m")
            click.echo(f"   Page: {layer['page']}")
            click.echo(f"   Description: {layer['description']}")
    else:
        tester.test_all_pdfs(Path(examples_dir), Path(expected_dir))


if __name__ == '__main__':
    main()
36
requirements.txt
Normal file
36
requirements.txt
Normal file
@@ -0,0 +1,36 @@
# Azure services
azure-ai-formrecognizer==3.3.0
azure-core>=1.29.0
openai>=1.0.0

# PDF processing
pdf2image==1.17.0
Pillow>=10.0.0
pytesseract==0.3.13  # Fallback OCR
pdfplumber==0.10.3  # Enhanced table extraction
PyPDF2>=3.0.1

# CLI and utilities
click==8.1.7
pathlib2==2.3.7.post1

# Data handling
numpy>=1.24.0
python-dateutil>=2.8.2

# For better logging
colorlog>=6.7.0

# Development tools (optional)
pytest>=7.4.0
black>=23.0.0
flake8>=6.0.0

# Type hints
typing-extensions>=4.0.0
dataclasses>=0.6

# For text processing
nltk>=3.8.1
textstat>=0.7.3
python-dotenv==1.0.0