Edge AI has become mainstream, enabling intelligent applications that run directly on devices. This post explores practical patterns for deploying AI models to the edge, covering model optimization, runtime selection, and update mechanisms.
Why Edge AI?
Edge deployment offers compelling advantages:
- Latency: Sub-10ms inference vs. hundreds of ms for cloud
- Privacy: Data stays on device
- Cost: No per-request API charges
- Offline: Works without connectivity
- Bandwidth: No need to upload data
Model Optimization Pipeline
Preparing models for edge deployment:
```python
import os
from typing import Dict

import onnx
import torch
from onnxruntime.quantization import QuantType, quantize_dynamic


class EdgeOptimizer:
    """Optimize models for edge deployment."""

    def __init__(self, model_path: str):
        self.model_path = model_path

    async def optimize_for_edge(
        self,
        target_platform: str,
        optimization_level: str = "aggressive"
    ) -> Dict:
        """Complete optimization pipeline."""
        steps_completed = []

        # Step 1: Convert to ONNX
        onnx_path = await self._convert_to_onnx()
        steps_completed.append("onnx_conversion")

        # Step 2: Optimize graph
        optimized_path = await self._optimize_onnx_graph(
            onnx_path,
            optimization_level
        )
        steps_completed.append("graph_optimization")

        # Step 3: Quantization
        quantized_path = await self._quantize_model(
            optimized_path,
            target_platform
        )
        steps_completed.append("quantization")

        # Step 4: Platform-specific compilation
        compiled_path = await self._compile_for_platform(
            quantized_path,
            target_platform
        )
        steps_completed.append("platform_compilation")

        # Step 5: Validation
        validation_results = await self._validate_model(
            compiled_path,
            self.model_path
        )

        return {
            'optimized_model_path': compiled_path,
            'steps_completed': steps_completed,
            'size_reduction': validation_results['size_reduction'],
            'speedup': validation_results['speedup'],
            'accuracy_delta': validation_results['accuracy_delta']
        }

    async def _convert_to_onnx(self) -> str:
        """Convert a PyTorch model to ONNX."""
        # Load model
        model = torch.load(self.model_path)
        model.eval()

        # Create dummy input (adjust the shape to your model's expected input)
        dummy_input = torch.randn(1, 3, 224, 224)

        # Export to ONNX
        onnx_path = self.model_path.replace('.pt', '.onnx')
        torch.onnx.export(
            model,
            dummy_input,
            onnx_path,
            export_params=True,
            opset_version=14,
            do_constant_folding=True,
            input_names=['input'],
            output_names=['output'],
            dynamic_axes={
                'input': {0: 'batch_size'},
                'output': {0: 'batch_size'}
            }
        )
        return onnx_path

    async def _optimize_onnx_graph(
        self,
        onnx_path: str,
        level: str
    ) -> str:
        """Optimize the ONNX graph with graph-level passes."""
        # Graph-level passes via the onnxoptimizer package
        import onnxoptimizer

        # Load model
        model = onnx.load(onnx_path)

        # Optimization passes per level
        optimization_options = {
            'fast': ['eliminate_identity', 'eliminate_nop_transpose'],
            'balanced': ['eliminate_identity', 'eliminate_nop_transpose',
                         'fuse_bn_into_conv'],
            'aggressive': ['eliminate_identity', 'eliminate_nop_transpose',
                           'fuse_bn_into_conv', 'fuse_add_bias_into_conv']
        }
        passes = optimization_options.get(level, optimization_options['balanced'])

        # Apply optimizations
        optimized_model = onnxoptimizer.optimize(model, passes)

        optimized_path = onnx_path.replace('.onnx', '_optimized.onnx')
        onnx.save(optimized_model, optimized_path)
        return optimized_path

    async def _quantize_model(
        self,
        model_path: str,
        target_platform: str
    ) -> str:
        """Quantize the model for the target platform."""
        # Platform-specific quantization strategies
        quant_config = {
            'mobile': {'mode': 'int8', 'per_channel': True},
            'iot': {'mode': 'int8', 'per_channel': False},
            'browser': {'mode': 'dynamic', 'per_channel': False},
        }
        config = quant_config.get(target_platform, quant_config['mobile'])
        quantized_path = model_path.replace('.onnx', '_quantized.onnx')

        if config['mode'] == 'int8':
            from onnxruntime.quantization import quantize_static

            # Calibration data reader with representative inputs (helper not shown)
            calibration_data = self._create_calibration_data()
            quantize_static(
                model_path,
                quantized_path,
                calibration_data,
                per_channel=config['per_channel']
            )
        else:
            # Dynamic quantization: weights quantized ahead of time,
            # activations at runtime
            quantize_dynamic(
                model_path,
                quantized_path,
                weight_type=QuantType.QInt8
            )
        return quantized_path

    async def _compile_for_platform(
        self,
        model_path: str,
        platform: str
    ) -> str:
        """Platform-specific compilation."""
        if platform == 'browser':
            # Package for WebAssembly execution
            return await self._compile_to_wasm(model_path)
        elif platform == 'mobile':
            # Convert for mobile runtimes (Core ML, TFLite)
            return await self._compile_to_mobile(model_path)
        elif platform == 'iot':
            # Convert for microcontrollers
            return await self._compile_to_micro(model_path)
        return model_path

    async def _compile_to_wasm(self, model_path: str) -> str:
        """Package an ONNX model for a WASM runtime."""
        wasm_path = model_path.replace('.onnx', '.wasm')
        # Conceptual: the actual step depends on the runtime
        # (ONNX Runtime Web, for example, loads the .onnx file directly).
        return wasm_path

    async def _validate_model(
        self,
        optimized_path: str,
        original_path: str
    ) -> Dict:
        """Validate the optimized model against the original."""
        # Size comparison
        original_size = os.path.getsize(original_path)
        optimized_size = os.path.getsize(optimized_path)
        size_reduction = (original_size - optimized_size) / original_size

        # Inference speed comparison (benchmark helper not shown)
        original_latency = await self._benchmark_model(original_path)
        optimized_latency = await self._benchmark_model(optimized_path)
        speedup = original_latency / optimized_latency

        # Accuracy comparison (helper not shown)
        accuracy_delta = await self._compare_accuracy(
            original_path,
            optimized_path
        )

        return {
            'size_reduction': size_reduction,
            'speedup': speedup,
            'accuracy_delta': accuracy_delta
        }
```
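To tie the pipeline together, here is a minimal driver sketch. It assumes the EdgeOptimizer class above; the model path and target platform are placeholders rather than values from this post:

```python
import asyncio

async def main():
    # Placeholder model path and platform; substitute your own
    optimizer = EdgeOptimizer("models/classifier.pt")
    report = await optimizer.optimize_for_edge(
        target_platform="browser",
        optimization_level="balanced"
    )
    print(f"Optimized model: {report['optimized_model_path']}")
    print(f"Size reduction: {report['size_reduction']:.1%}, "
          f"speedup: {report['speedup']:.2f}x, "
          f"accuracy delta: {report['accuracy_delta']:+.4f}")

asyncio.run(main())
```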
Edge Runtime Architecture
Efficient model serving at the edge, shown here as a WebAssembly-compatible inference runtime written in Rust on top of the ort bindings for ONNX Runtime:
```rust
use wasm_bindgen::prelude::*;
use ort::{Environment, SessionBuilder, Value};
use std::sync::Arc;

// WASM-compatible inference runtime
#[wasm_bindgen]
pub struct EdgeRuntime {
    environment: Arc<Environment>,
    model_data: Vec<u8>,
}

#[wasm_bindgen]
impl EdgeRuntime {
    #[wasm_bindgen(constructor)]
    pub fn new(model_bytes: &[u8]) -> Result<EdgeRuntime, JsValue> {
        let environment = Arc::new(
            Environment::builder()
                .with_name("edge-runtime")
                .build()
                .map_err(|e| JsValue::from_str(&e.to_string()))?,
        );

        Ok(EdgeRuntime {
            environment,
            model_data: model_bytes.to_vec(),
        })
    }

    #[wasm_bindgen]
    pub async fn infer(&self, input_data: &[f32]) -> Result<Vec<f32>, JsValue> {
        // Create session
        let session = SessionBuilder::new(&self.environment)
            .map_err(|e| JsValue::from_str(&e.to_string()))?
            .with_model_from_memory(&self.model_data)
            .map_err(|e| JsValue::from_str(&e.to_string()))?;

        // Prepare input tensor
        let input_shape = vec![1, input_data.len()];
        let input_tensor = Value::from_array(
            session.allocator(),
            &input_shape,
            input_data,
        )
        .map_err(|e| JsValue::from_str(&e.to_string()))?;

        // Run inference
        let outputs = session
            .run(vec![input_tensor])
            .map_err(|e| JsValue::from_str(&e.to_string()))?;

        // Extract output as f32
        let output = outputs[0]
            .try_extract::<f32>()
            .map_err(|e| JsValue::from_str(&e.to_string()))?
            .view()
            .to_owned();

        Ok(output.as_slice().unwrap().to_vec())
    }

    // Model caching
    pub async fn cache_model(&self) -> Result<(), JsValue> {
        // Store the model in localStorage (IndexedDB is the better fit for large models)
        let window = web_sys::window().unwrap();
        let storage = window
            .local_storage()
            .map_err(|_| JsValue::from_str("Storage not available"))?
            .ok_or_else(|| JsValue::from_str("Storage not available"))?;

        let model_b64 = base64::encode(&self.model_data);
        storage
            .set_item("edge_model", &model_b64)
            .map_err(|_| JsValue::from_str("Failed to cache model"))?;

        Ok(())
    }

    pub async fn load_cached_model() -> Result<Option<Vec<u8>>, JsValue> {
        let window = web_sys::window().unwrap();
        let storage = window
            .local_storage()
            .map_err(|_| JsValue::from_str("Storage not available"))?
            .ok_or_else(|| JsValue::from_str("Storage not available"))?;

        match storage
            .get_item("edge_model")
            .map_err(|_| JsValue::from_str("Failed to load cached model"))?
        {
            Some(model_b64) => {
                let model_data = base64::decode(&model_b64)
                    .map_err(|_| JsValue::from_str("Failed to decode model"))?;
                Ok(Some(model_data))
            }
            None => Ok(None),
        }
    }
}
```
Model Update Mechanism
Over-the-air updates for edge models:
```python
import hashlib
import os
from typing import Callable, Dict, Optional

import httpx


class EdgeModelManager:
    """Manage model updates for edge devices."""

    def __init__(
        self,
        update_server_url: str,
        device_id: str,
        current_version: str
    ):
        self.update_server_url = update_server_url
        self.device_id = device_id
        self.current_version = current_version
        # Async HTTP client (httpx) for update checks and downloads
        self.http_client = httpx.AsyncClient()

    async def check_for_updates(self) -> Optional[Dict]:
        """Check whether a newer model version is available."""
        response = await self.http_client.get(
            f"{self.update_server_url}/models/latest",
            params={
                'device_id': self.device_id,
                'current_version': self.current_version,
                'platform': self._get_platform()
            }
        )
        data = response.json()

        if data['has_update']:
            return {
                'version': data['version'],
                'url': data['download_url'],
                'size': data['size_bytes'],
                'changelog': data['changelog'],
                'mandatory': data.get('mandatory', False)
            }
        return None

    async def download_update(
        self,
        update_info: Dict,
        progress_callback: Optional[Callable] = None
    ) -> str:
        """Download a model update with resume support."""
        local_path = f"models/{update_info['version']}.onnx"
        os.makedirs(os.path.dirname(local_path), exist_ok=True)

        # Check if a partial download exists
        existing_size = 0
        if os.path.exists(local_path):
            existing_size = os.path.getsize(local_path)

        # Resume the download if possible
        headers = {}
        if existing_size > 0:
            headers['Range'] = f'bytes={existing_size}-'

        async with self.http_client.stream(
            'GET',
            update_info['url'],
            headers=headers
        ) as response:
            mode = 'ab' if existing_size > 0 else 'wb'
            with open(local_path, mode) as f:
                downloaded = existing_size
                async for chunk in response.aiter_bytes(chunk_size=8192):
                    f.write(chunk)
                    downloaded += len(chunk)

                    if progress_callback:
                        progress = downloaded / update_info['size']
                        await progress_callback(progress)

        return local_path

    async def verify_update(self, model_path: str, expected_hash: str) -> bool:
        """Verify the integrity of the downloaded model."""
        sha256 = hashlib.sha256()
        with open(model_path, 'rb') as f:
            while chunk := f.read(8192):
                sha256.update(chunk)
        return sha256.hexdigest() == expected_hash

    async def apply_update(
        self,
        model_path: str,
        safe_mode: bool = True
    ) -> Dict:
        """Apply a model update with rollback capability."""
        backup_path = None
        if safe_mode:
            # Back up the current model
            backup_path = await self._backup_current_model()

        try:
            # Test the new model
            validation_results = await self._validate_new_model(model_path)
            if not validation_results['passed']:
                raise ValueError(f"Model validation failed: {validation_results}")

            # Replace the current model
            await self._replace_model(model_path)

            # Update version
            self.current_version = self._extract_version(model_path)

            return {
                'success': True,
                'version': self.current_version,
                'validation': validation_results
            }
        except Exception as e:
            if safe_mode and backup_path:
                # Roll back to the previous model
                await self._restore_backup(backup_path)

            return {
                'success': False,
                'error': str(e),
                'rolled_back': safe_mode
            }

    async def _validate_new_model(self, model_path: str) -> Dict:
        """Validate the new model before deployment."""
        # Load the model into the edge runtime
        runtime = EdgeRuntime(model_path)

        # Run test inference
        test_inputs = self._generate_test_inputs()
        results = []

        for test_input in test_inputs:
            try:
                output = await runtime.infer(test_input['data'])
                # Validate output shape and value range
                is_valid = self._validate_output(
                    output,
                    test_input['expected_shape'],
                    test_input['expected_range']
                )
                results.append(is_valid)
            except Exception:
                results.append(False)

        return {
            'passed': all(results),
            'test_results': results
        }
```
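Putting the manager together, a minimal sketch of the full update loop: check, download, verify, apply. The server URL and device ID are placeholders, and the snippet assumes the update metadata also carries a sha256 digest for verification, which the class above does not specify:

```python
import asyncio

async def update_loop():
    manager = EdgeModelManager(
        update_server_url="https://updates.example.com",  # placeholder
        device_id="device-1234",                          # placeholder
        current_version="1.3.0"
    )

    update = await manager.check_for_updates()
    if update is None:
        return

    async def report(progress: float):
        print(f"{progress:.0%} downloaded")

    path = await manager.download_update(update, progress_callback=report)

    # 'sha256' is assumed to be part of the update metadata
    if not await manager.verify_update(path, update.get('sha256', '')):
        print("Hash mismatch, discarding update")
        return

    result = await manager.apply_update(path, safe_mode=True)
    print("Updated to", result['version'] if result['success'] else result['error'])

asyncio.run(update_loop())
```

Beyond updates, inference itself should adapt to the device's current state. The AdaptiveInference helper below profiles the device and picks a strategy accordingly: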
```python
import os
from typing import Any, Dict


class AdaptiveInference:
    """Adapt the inference strategy to device capabilities."""

    def __init__(self):
        self.device_profile = self._profile_device()

    def _profile_device(self) -> Dict:
        """Profile device capabilities."""
        return {
            'cpu_cores': os.cpu_count(),
            'memory_available': self._get_available_memory(),
            'has_gpu': self._check_gpu_availability(),
            'battery_powered': self._is_battery_powered(),
            'network_quality': self._assess_network()
        }

    async def infer(
        self,
        model: EdgeRuntime,
        input_data: Any,
        priority: str = 'balanced'
    ) -> Any:
        """Resource-aware adaptive inference."""
        # Select a strategy based on the current device state
        if self._is_low_battery():
            # Energy-efficient mode
            strategy = 'lightweight'
        elif self._is_thermal_throttling():
            # Reduce load
            strategy = 'throttled'
        else:
            # Normal operation
            strategy = priority

        # Apply the strategy
        if strategy == 'lightweight':
            # Reduce precision or skip optional processing
            return await self._lightweight_inference(model, input_data)
        elif strategy == 'throttled':
            # Batch or delay inference
            return await self._throttled_inference(model, input_data)
        else:
            # Full-quality inference
            return await model.infer(input_data)

    def _is_low_battery(self) -> bool:
        """Check whether the device battery is low."""
        # Platform-specific battery check
        return False

    def _is_thermal_throttling(self) -> bool:
        """Check whether the device is thermal throttling."""
        # Platform-specific thermal check
        return False
```
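The battery and memory checks above are left as stubs. As one possibility, here is a minimal sketch of what they might look like on a Linux-class device using psutil; the library choice and the 20% threshold are assumptions, not part of the original design:

```python
import psutil

def get_available_memory() -> int:
    """Bytes of memory currently available for new allocations."""
    return psutil.virtual_memory().available

def is_battery_powered() -> bool:
    """True if the device is currently running on battery."""
    battery = psutil.sensors_battery()
    return battery is not None and not battery.power_plugged

def is_low_battery(threshold_percent: float = 20.0) -> bool:
    """True if the battery level is below the (assumed) threshold."""
    battery = psutil.sensors_battery()
    return battery is not None and battery.percent < threshold_percent
```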
Conclusion
Edge AI deployment requires careful optimization, efficient runtimes, and robust update mechanisms. WebAssembly provides a universal deployment target, while platform-specific optimizations maximize performance.
The key is balancing model quality with device constraints—memory, compute, and battery. As edge devices become more powerful and models more efficient, we’ll see increasingly sophisticated AI running entirely on-device.