"""Generate realistic mock data from OpenAPI/JSON schemas."""

import base64
import math
import re
from datetime import date, datetime, timezone
from typing import Any, Dict, List, Optional, Union

from faker import Faker

fake = Faker()

# Characters that give a regex fragment a non-literal meaning. A pattern (or
# alternation branch) free of all of them matches only itself.
_REGEX_METACHARS = frozenset('.^$*+?{}[]\\|()')

# Sentinel distinguishing "reference not found" from a cached schema of None.
_MISSING = object()


def _is_number(value: Any) -> bool:
    """Return True for int/float values, excluding booleans."""
    return isinstance(value, (int, float)) and not isinstance(value, bool)


class DataGenerationError(Exception):
    """Base exception for data generation errors."""


class UnsupportedTypeError(DataGenerationError):
    """Raised when an unsupported schema type is encountered."""


class FakerMapping:
    """Maps schema types and formats to Faker providers."""

    # Schema ``format`` keyword -> name of the Faker method producing it.
    FORMAT_MAPPINGS = {
        'email': 'email',
        'date-time': 'date_time',
        'date': 'date',
        'time': 'time',
        'uri': 'uri',
        'url': 'url',
        'uuid': 'uuid4',
        'hostname': 'hostname',
        'ipv4': 'ipv4',
        'ipv6': 'ipv6',
        'phone-number': 'phone_number',
        'binary': 'binary',
        'byte': 'binary',
        'password': 'password',
    }

    # Schema ``type`` keyword -> name of the Faker method used as a fallback.
    # NOTE(review): 'none' (for 'null') and 'pyint' (for 'array'/'object') do
    # not look like meaningful providers for those types; DataGenerator never
    # relies on them. Values are left untouched because this is public data.
    TYPE_MAPPINGS = {
        'string': 'word',
        'integer': 'random_int',
        'number': 'pyfloat',
        'boolean': 'boolean',
        'array': 'pyint',
        'object': 'pyint',
        'null': 'none',
    }

    @classmethod
    def get_faker_method(cls, schema: Dict[str, Any]) -> str:
        """Get the appropriate Faker method for a schema.

        A recognised ``format`` wins over ``type``; an unknown or missing
        type falls back to ``'word'``.

        Args:
            schema: JSON schema dictionary.

        Returns:
            Name of the Faker method to use.
        """
        if 'format' in schema and schema['format'] in cls.FORMAT_MAPPINGS:
            return cls.FORMAT_MAPPINGS[schema['format']]
        schema_type = schema.get('type', 'string')
        return cls.TYPE_MAPPINGS.get(schema_type, 'word')


class DataGenerator:
    """Generate mock data based on JSON Schema definitions.

    References (``$ref``) are resolved against a cache of schemas supplied
    through :meth:`set_ref_cache`. Recursive references are expanded only a
    bounded number of times so that self-referential schemas terminate.
    """

    # How many times a single ``$ref`` may be nested inside its own expansion
    # before generation stops and yields None for the innermost occurrence.
    _MAX_REF_DEPTH = 3

    def __init__(self, locale: str = 'en_US'):
        """Initialize the data generator.

        Args:
            locale: Faker locale for localized data generation.
        """
        self.fake = Faker(locale)
        self._ref_cache: Dict[str, Any] = {}
        # Number of in-progress expansions per reference string.
        self._ref_depth: Dict[str, int] = {}

    def reset_cache(self) -> None:
        """Clear the reference cache."""
        self._ref_cache = {}

    def set_ref_cache(self, schemas: Dict[str, Any]) -> None:
        """Set the schema references cache.

        Args:
            schemas: Dictionary of schema definitions, keyed either by the
                full reference string or by the bare schema name.
        """
        self._ref_cache = schemas

    def generate(self, schema: Dict[str, Any]) -> Any:
        """Generate mock data from a JSON schema.

        Args:
            schema: JSON schema dictionary.

        Returns:
            Generated mock data.
        """
        return self.generate_from_schema(schema)

    def generate_from_schema(self, schema: Dict[str, Any]) -> Any:
        """Generate mock data from a JSON schema.

        Non-schema inputs are tolerated: ``None`` yields ``None``, a list
        yields a one-element list generated from its first entry, and any
        other non-dict value is returned unchanged.

        Args:
            schema: JSON schema dictionary.

        Returns:
            Generated mock data.

        Raises:
            UnsupportedTypeError: If the schema ``type`` is not one of
                object, array, string, integer, number, boolean or null.
        """
        if schema is None:
            return None
        if isinstance(schema, list):
            if not schema:
                return []
            return [self.generate_from_schema(schema[0])]
        if not isinstance(schema, dict):
            return schema
        if '$ref' in schema:
            return self._resolve_ref(schema['$ref'])

        # Nullable schemas produce None roughly one time in ten.
        if schema.get('nullable', False) and self.fake.random.random() < 0.1:
            return None

        # ``enum`` constrains every type, not only strings.
        enum_values = schema.get('enum')
        if enum_values:
            return self.fake.random.choice(enum_values)

        schema_type = schema.get('type', 'object')
        if isinstance(schema_type, list) and schema_type:
            # JSON Schema allows a list of alternative types; pick one and
            # hand the per-type generator a schema with a scalar ``type``.
            schema_type = self.fake.random.choice(schema_type)
            schema = dict(schema, type=schema_type)

        if schema_type == 'null':
            return None

        handlers = {
            'object': self._generate_object,
            'array': self._generate_array,
            'string': self._generate_string,
            'integer': self._generate_integer,
            'number': self._generate_number,
            'boolean': self._generate_boolean,
        }
        # Guard the lookup: an unhashable ``type`` must raise the domain
        # error below rather than a TypeError from the dict.
        handler = handlers.get(schema_type) if isinstance(schema_type, str) else None
        if handler is None:
            raise UnsupportedTypeError(f"Unsupported schema type: {schema_type}")
        return handler(schema)

    def _lookup_ref(self, ref: str) -> Any:
        """Find the schema a reference points to, or ``_MISSING``."""
        if ref in self._ref_cache:
            return self._ref_cache[ref]

        parts = ref.lstrip('#/').split('/')
        if len(parts) > 2 and parts[0] == 'components' and parts[1] == 'schemas':
            schema_name = parts[2]
            if schema_name and schema_name in self._ref_cache:
                return self._ref_cache[schema_name]
        return _MISSING

    def _resolve_ref(self, ref: str) -> Any:
        """Resolve a $ref and generate data from the referenced schema.

        The cache is consulted first with the full reference string and then,
        for ``#/components/schemas/<Name>`` references, with ``<Name>``.

        Args:
            ref: Reference string like '#/components/schemas/User'.

        Returns:
            Data generated from the referenced schema, or None if the
            reference is unknown or is already nested ``_MAX_REF_DEPTH``
            times inside its own expansion.
        """
        target = self._lookup_ref(ref)
        if target is _MISSING:
            return None

        depth = self._ref_depth.get(ref, 0)
        if depth >= self._MAX_REF_DEPTH:
            # Recursive schema: stop here instead of recursing without bound.
            return None

        self._ref_depth[ref] = depth + 1
        try:
            return self.generate_from_schema(target)
        finally:
            # Restore the counter even if generation raised.
            if depth:
                self._ref_depth[ref] = depth
            else:
                self._ref_depth.pop(ref, None)

    def _generate_object(self, schema: Dict[str, Any]) -> Dict[str, Any]:
        """Generate a mock object from an object schema.

        Required properties are always present; optional ones are included
        about 80% of the time. Up to three extra properties are added when
        ``additionalProperties`` is ``True`` (random primitives) or a
        non-empty schema (values generated from that schema).

        Args:
            schema: Object schema dictionary.

        Returns:
            Generated object dictionary.
        """
        result: Dict[str, Any] = {}
        properties = schema.get('properties') or {}
        required_props = schema.get('required', [])
        additional_properties = schema.get('additionalProperties', True)

        for prop_name, prop_schema in properties.items():
            if prop_name in required_props or self.fake.random.random() < 0.8:
                result[prop_name] = self.generate_from_schema(prop_schema)

        if additional_properties is True:
            make_extra = self._generate_from_additional
        elif isinstance(additional_properties, dict) and additional_properties:
            def make_extra() -> Any:
                return self.generate_from_schema(additional_properties)
        else:
            make_extra = None

        if make_extra is not None:
            for _ in range(self.fake.random.randint(0, 3)):
                extra_name = self.fake.word()
                # Never reuse the name of a declared property, even one that
                # was randomly omitted: the extra value would not follow that
                # property's schema.
                if extra_name not in result and extra_name not in properties:
                    result[extra_name] = make_extra()

        return result

    def _generate_from_additional(self) -> Any:
        """Generate data for additional properties.

        Returns:
            A random word, integer or boolean.
        """
        schema_type = self.fake.random.choice(['string', 'integer', 'boolean'])
        if schema_type == 'string':
            return self.fake.word()
        if schema_type == 'integer':
            return self.fake.random_int()
        return self.fake.boolean()

    def _generate_array(self, schema: Dict[str, Any]) -> List[Any]:
        """Generate a mock array from an array schema.

        The length is drawn from ``minItems``..``maxItems`` (defaults 1 and
        5). A default is widened when the other bound is given explicitly and
        would otherwise make the range empty.

        Args:
            schema: Array schema dictionary.

        Returns:
            Generated array list.

        Raises:
            ValueError: If explicit ``minItems`` exceeds explicit
                ``maxItems``.
        """
        items_schema = schema.get('items', {})
        min_items = schema.get('minItems', 1)
        max_items = schema.get('maxItems', 5)
        if min_items > max_items:
            if 'maxItems' not in schema:
                max_items = min_items
            elif 'minItems' not in schema:
                min_items = max_items

        num_items = self.fake.random.randint(min_items, max_items)
        return [self.generate_from_schema(items_schema) for _ in range(num_items)]

    def _generate_string(self, schema: Dict[str, Any]) -> str:
        """Generate a mock string from a string schema.

        Precedence: ``enum``, then a recognised ``format``, then ``pattern``,
        then plain words sized by ``minLength``/``maxLength`` (defaults 1 and
        50, the latter raised to ``minLength`` when that is larger).

        Args:
            schema: String schema dictionary.

        Returns:
            Generated string.
        """
        enum_values = schema.get('enum')
        if enum_values:
            return self.fake.random.choice(enum_values)

        # Only a recognised format selects a Faker provider. Plain strings
        # must fall through so pattern and length constraints are honoured.
        format_type = schema.get('format')
        faker_method = (
            FakerMapping.FORMAT_MAPPINGS.get(format_type)
            if isinstance(format_type, str) else None
        )
        if faker_method and hasattr(self.fake, faker_method):
            return self._stringify(getattr(self.fake, faker_method)())

        pattern = schema.get('pattern')
        if pattern:
            return self._generate_by_pattern(pattern)

        min_length = schema.get('minLength', 1)
        max_length = schema.get('maxLength', max(min_length, 50))
        result = self.fake.word()
        while len(result) < min_length:
            result += self.fake.word()
        if len(result) > max_length:
            result = result[:max_length]
        return result

    @staticmethod
    def _stringify(value: Any) -> str:
        """Render a provider result as a JSON-friendly string.

        Datetimes and dates use ISO 8601 (naive datetimes are labelled UTC so
        the value carries an offset); bytes are base64-encoded; anything else
        goes through ``str``.
        """
        if isinstance(value, str):
            return value
        # datetime is a subclass of date, so it must be tested first.
        if isinstance(value, datetime):
            if value.tzinfo is None:
                value = value.replace(tzinfo=timezone.utc)
            return value.isoformat()
        if isinstance(value, date):
            return value.isoformat()
        if isinstance(value, (bytes, bytearray)):
            # NOTE(review): the binary provider is called with its default
            # length, which may be large — confirm and cap if needed.
            return base64.b64encode(bytes(value)).decode('ascii')
        return str(value)

    @staticmethod
    def _is_literal(text: str) -> bool:
        """Return True if *text* contains no regex metacharacters."""
        return _REGEX_METACHARS.isdisjoint(text)

    def _generate_by_pattern(self, pattern: str) -> str:
        """Generate a string matching a simple regex pattern.

        Only two shapes are understood, after stripping ``^``/``$`` anchors
        and one enclosing group: an alternation of literals (one branch is
        chosen) and a single literal (returned as is). Any other pattern, or
        a non-string pattern, falls back to a random word that is not
        guaranteed to match.

        Args:
            pattern: Regular expression pattern.

        Returns:
            Generated string.
        """
        if not isinstance(pattern, str):
            return self.fake.word()

        if pattern.startswith('^'):
            pattern = pattern[1:]
        if pattern.endswith('$'):
            pattern = pattern[:-1]

        # Unwrap "(a|b)" / "(?:a|b)" when the parentheses enclose everything.
        if pattern.startswith('(') and pattern.endswith(')'):
            inner = pattern[1:-1]
            if inner.startswith('?:'):
                inner = inner[2:]
            if '(' not in inner and ')' not in inner:
                pattern = inner

        if '|' in pattern:
            options = pattern.split('|')
            # Splitting is only meaningful when every branch is literal;
            # otherwise the pieces would be regex fragments.
            if all(self._is_literal(option) for option in options):
                return self.fake.random.choice(options)
            return self.fake.word()

        if pattern and self._is_literal(pattern):
            return pattern
        return self.fake.word()

    def _generate_integer(self, schema: Dict[str, Any]) -> int:
        """Generate a mock integer from an integer schema.

        Honours ``minimum``/``maximum`` (defaults 0 and 1000),
        ``exclusiveMinimum``/``exclusiveMaximum`` in both the boolean and the
        numeric form, and ``multipleOf`` (assumed positive). A default bound
        is moved when the opposite bound would otherwise leave an empty range.

        Args:
            schema: Integer schema dictionary.

        Returns:
            Generated integer.

        Raises:
            ValueError: If the constraints admit no integer.
            ZeroDivisionError: If ``multipleOf`` is zero.
        """
        minimum = schema.get('minimum', 0)
        maximum = schema.get('maximum', 1000)
        exclusive_minimum = schema.get('exclusiveMinimum', False)
        exclusive_maximum = schema.get('exclusiveMaximum', False)

        has_lower = 'minimum' in schema
        has_upper = 'maximum' in schema

        if _is_number(exclusive_minimum):
            # Numeric form: the keyword is itself the (strict) bound.
            lower_strict = False
            if not has_lower or exclusive_minimum >= minimum:
                minimum, lower_strict = exclusive_minimum, True
            has_lower = True
        else:
            lower_strict = bool(exclusive_minimum)

        if _is_number(exclusive_maximum):
            upper_strict = False
            if not has_upper or exclusive_maximum <= maximum:
                maximum, upper_strict = exclusive_maximum, True
            has_upper = True
        else:
            upper_strict = bool(exclusive_maximum)

        # Smallest / largest integers satisfying the (possibly fractional,
        # possibly strict) bounds.
        lower = math.floor(minimum) + 1 if lower_strict else math.ceil(minimum)
        upper = math.ceil(maximum) - 1 if upper_strict else math.floor(maximum)

        if lower > upper:
            # Only a *default* bound may be moved to make room.
            if not has_upper:
                upper = lower + 1000
            elif not has_lower:
                lower = upper - 1000

        if 'multipleOf' in schema:
            multiple = schema['multipleOf']
            if isinstance(multiple, float) and multiple.is_integer():
                multiple = int(multiple)
            # Ceiling division for the lower factor keeps the result at or
            # above the lower bound; floor division would undershoot it.
            lowest_factor = -(-lower // multiple)
            highest_factor = upper // multiple
            return self.fake.random.randint(lowest_factor, highest_factor) * multiple

        return self.fake.random.randint(lower, upper)

    def _generate_number(self, schema: Dict[str, Any]) -> float:
        """Generate a mock number from a number schema.

        The value is drawn uniformly between ``minimum`` and ``maximum``
        (defaults 0.0 and 1000.0), rounded to ``precision`` decimal places
        (default 2) and clamped back into the bounds. A default bound is
        moved when the opposite explicit bound lies beyond it.

        Args:
            schema: Number schema dictionary.

        Returns:
            Generated float.
        """
        minimum = schema.get('minimum', 0.0)
        maximum = schema.get('maximum', 1000.0)
        precision = schema.get('precision', 2)

        if minimum > maximum:
            if 'maximum' not in schema:
                maximum = minimum + 1000.0
            elif 'minimum' not in schema:
                minimum = maximum - 1000.0

        value = round(self.fake.random.uniform(minimum, maximum), precision)
        # Rounding can step just outside the range; the bounds win.
        return float(max(minimum, min(value, maximum)))

    def _generate_boolean(self, schema: Dict[str, Any]) -> bool:
        """Generate a mock boolean from a boolean schema.

        Args:
            schema: Boolean schema dictionary (unused).

        Returns:
            Generated boolean.
        """
        del schema
        return self.fake.boolean()


def generate_mock_data(
    schema: Dict[str, Any],
    schemas: Optional[Dict[str, Any]] = None,
    locale: str = 'en_US'
) -> Any:
    """Convenience function to generate mock data from a schema.

    Args:
        schema: JSON schema dictionary.
        schemas: Optional dictionary of referenced schemas.
        locale: Faker locale for localized data.

    Returns:
        Generated mock data.
    """
    generator = DataGenerator(locale)
    if schemas:
        generator.set_ref_cache(schemas)
    return generator.generate(schema)