Module pychnosz.data.obigt
OBIGT database access module.
This module provides a high-level interface to the OBIGT database (the name derives from OrganoBioGeoTherm), which contains thermodynamic parameters for chemical species.
Functions
def get_default_obigt() ‑> OBIGTDatabase-
Expand source code
def get_default_obigt() -> OBIGTDatabase:
    """
    Build an OBIGT database object with default settings.

    Returns:
    --------
    OBIGTDatabase
        A new instance constructed with no arguments
    """
    default_database = OBIGTDatabase()
    return default_database
Returns:
OBIGTDatabase Default OBIGT database instance
Classes
class OBIGTDatabase (data_loader: DataLoader | None = None)-
Expand source code
class OBIGTDatabase:
    """
    High-level interface to the OBIGT thermodynamic database.

    This class provides methods to access, search, and manipulate the
    thermodynamic data from the OBIGT database files.

    Rows loaded through ``load_all_data`` carry a 1-based index, so all
    internal row lookups are label-based (``.loc``), never positional.
    """

    def __init__(self, data_loader: Optional["DataLoader"] = None):
        """
        Initialize the OBIGT database.

        Parameters:
        -----------
        data_loader : DataLoader, optional
            DataLoader instance to use. If None, creates a default loader.
        """
        if data_loader is None:
            # Imported lazily so the loader module is only needed when no
            # loader is supplied by the caller.
            from .loader import get_default_loader
            self.loader = get_default_loader()
        else:
            self.loader = data_loader

        # Cache for combined data
        self._combined_data = None
        self._species_index = None

        # Define the expected columns for OBIGT data
        self.obigt_columns = [
            'name', 'abbrv', 'formula', 'state', 'ref1', 'ref2', 'date',
            'model', 'E_units', 'G', 'H', 'S', 'Cp', 'V', 'a1.a', 'a2.b',
            'a3.c', 'a4.d', 'c1.e', 'c2.f', 'omega.lambda', 'z.T'
        ]

        # State classifications
        self.aqueous_states = ['aq']
        self.crystalline_states = ['cr']
        self.gas_states = ['gas']
        self.liquid_states = ['liq']

    def load_all_data(self, force_reload: bool = False) -> pd.DataFrame:
        """
        Load and combine all OBIGT data files.

        Parameters:
        -----------
        force_reload : bool, default False
            Force reloading of data even if cached

        Returns:
        --------
        pd.DataFrame
            Copy of the combined OBIGT database
        """
        if self._combined_data is not None and not force_reload:
            return self._combined_data.copy()

        # Load all OBIGT files
        obigt_files = self.loader.load_all_obigt_files()

        # Tag every frame with the file it came from before combining
        combined_data = []
        for filename, df in obigt_files.items():
            df_copy = df.copy()
            df_copy['source_file'] = filename
            combined_data.append(df_copy)

        self._combined_data = pd.concat(combined_data, ignore_index=True)

        # IMPORTANT: R uses 1-based indexing, so we need to shift the
        # DataFrame index to match R's row numbers. Row 0 in pandas should
        # be row 1 in R.
        self._combined_data.index = self._combined_data.index + 1

        # Create species index for fast lookups
        self._create_species_index()

        return self._combined_data.copy()

    def get_combined_data(self) -> pd.DataFrame:
        """
        Get combined OBIGT thermodynamic data.

        If loading fails for any reason, a warning is printed and a small
        built-in fallback table is returned instead.

        Returns:
        --------
        pd.DataFrame
            Combined OBIGT data with all species
        """
        if self._combined_data is not None:
            return self._combined_data.copy()

        try:
            # Try to load data normally first
            return self.load_all_data()
        except Exception as e:
            print(f"Warning: Could not load OBIGT data: {e}")
            # Create minimal fallback data for essential species
            return self._create_fallback_data()

    def _create_fallback_data(self) -> pd.DataFrame:
        """Create minimal fallback data for essential species."""
        # Essential species data (approximate values for basic functionality)
        fallback_data = {
            'name': ['water', 'H+', 'OH-', 'CO2', 'HCO3-', 'CO3-2'],
            'abbrv': ['H2O', 'H+', 'OH-', 'CO2', 'HCO3-', 'CO3-2'],
            'formula': ['H2O', 'H+', 'OH-', 'CO2', 'HCO3-', 'CO3-2'],
            'state': ['liq', 'aq', 'aq', 'aq', 'aq', 'aq'],
            'G': [-56688.1, 0.0, -37595.0, -92307.0, -140314.0, -126172.0],
            'H': [-68317.0, 0.0, -54977.0, -98900.0, -165180.0, -161963.0],
            'S': [16.712, 0.0, -2.56, -39.75, 98.4, -50.0],
            'Cp': [18.0, 0.0, -36.4, 37.11, 25.0, -53.1],
            'V': [18.068, 0.0, -4.71, 34.0, 25.0, -6.0],
            'z.T': [0, 1, -1, 0, -1, -2],
            'ref1': ['', '', '', '', '', ''],
            'ref2': ['', '', '', '', '', ''],
            'date': ['', '', '', '', '', ''],
            'model': ['', '', '', '', '', ''],
            'E_units': ['', '', '', '', '', ''],
            'a1.a': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
            'a2.b': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
            'a3.c': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
            'a4.d': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
            'c1.e': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
            'c2.f': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
            'omega.lambda': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
        }

        df = pd.DataFrame(fallback_data)

        # Cache the fallback data
        self._combined_data = df
        self._create_species_index()

        return df.copy()

    def _create_species_index(self):
        """
        Create an index for fast species lookups.

        The index is a dict mapping three kinds of keys to lists of row
        labels of the combined table: the species name, ``formula:<formula>``
        and ``<name>(<state>)``. Missing (NaN) or blank values are skipped.
        """
        if self._combined_data is None:
            return

        data = self._combined_data

        def _text_values(column):
            # Stripped text per row; absent columns and NaN become ''.
            if column not in data.columns:
                return [''] * len(data)
            return ['' if pd.isna(value) else str(value).strip()
                    for value in data[column]]

        names = _text_values('name')
        formulas = _text_values('formula')
        states = _text_values('state')

        index = {}
        for label, name, formula, state in zip(data.index, names, formulas, states):
            if name:
                index.setdefault(name, []).append(label)
            if formula:
                index.setdefault(f"formula:{formula}", []).append(label)
            if name and state:
                index.setdefault(f"{name}({state})", []).append(label)

        self._species_index = index

    def get_species(self, identifier: str, state: Optional[str] = None) -> pd.DataFrame:
        """
        Get species data by name, formula, or identifier.

        Exact matches on name, formula and ``name(state)`` are tried first;
        each matching row is returned once. Only if none is found does the
        method fall back to a case-insensitive literal substring search in
        the ``name`` and ``formula`` columns.

        Parameters:
        -----------
        identifier : str
            Species name, formula, or identifier
        state : str, optional
            Physical state ('aq', 'cr', 'gas', 'liq')

        Returns:
        --------
        pd.DataFrame
            Matching species data, re-indexed from 0
        """
        if self._combined_data is None:
            self.load_all_data()
        if self._species_index is None:
            self._create_species_index()

        data = self._combined_data
        if 'state' in data.columns:
            row_states = data['state'].astype(str).str.strip()
        else:
            row_states = pd.Series('', index=data.index)

        lookup_keys = [identifier, f"formula:{identifier}"]
        if state:
            lookup_keys.append(f"{identifier}({state})")

        # The species index stores row *labels*, so rows are resolved with
        # label-based access; `seen` keeps a row from being returned once
        # per matching key.
        labels = []
        seen = set()
        for key in lookup_keys:
            for label in self._species_index.get(key, ()):
                if label in seen:
                    continue
                if state is None or row_states.loc[label] == state:
                    seen.add(label)
                    labels.append(label)

        # If no exact matches, try partial matching. regex=False so that
        # identifiers such as 'H+' are taken literally.
        if not labels:
            mask = (
                data['name'].str.contains(identifier, case=False, na=False, regex=False)
                | data['formula'].str.contains(identifier, case=False, na=False, regex=False)
            )
            if state:
                mask &= (row_states == state)
            labels = data.index[mask].tolist()

        if not labels:
            return pd.DataFrame(columns=data.columns)
        return data.loc[labels].reset_index(drop=True)

    def search_species(self, query: str,
                       search_columns: Optional[List[str]] = None) -> pd.DataFrame:
        """
        Search for species using a text query.

        The query is matched case-insensitively with
        ``Series.str.contains`` in each requested column that exists.

        Parameters:
        -----------
        query : str
            Search query
        search_columns : List[str], optional
            Columns to search in. Default: ['name', 'formula', 'abbrv']

        Returns:
        --------
        pd.DataFrame
            Matching species data, re-indexed from 0
        """
        if self._combined_data is None:
            self.load_all_data()

        if search_columns is None:
            search_columns = ['name', 'formula', 'abbrv']

        data = self._combined_data

        # The mask must share the data's own index, otherwise combining it
        # with per-column results depends on implicit index alignment.
        mask = pd.Series(False, index=data.index)
        for col in search_columns:
            if col in data.columns:
                mask |= data[col].str.contains(query, case=False, na=False)

        return data[mask].reset_index(drop=True)

    def get_species_by_state(self, state: str) -> pd.DataFrame:
        """
        Get all species in a specific physical state.

        Parameters:
        -----------
        state : str
            Physical state ('aq', 'cr', 'gas', 'liq')

        Returns:
        --------
        pd.DataFrame
            Species data for the specified state, re-indexed from 0
        """
        if self._combined_data is None:
            self.load_all_data()

        mask = self._combined_data['state'] == state
        return self._combined_data[mask].reset_index(drop=True)

    def get_aqueous_species(self) -> pd.DataFrame:
        """Get all aqueous species."""
        return self.get_species_by_state('aq')

    def get_crystalline_species(self) -> pd.DataFrame:
        """Get all crystalline species."""
        return self.get_species_by_state('cr')

    def get_gas_species(self) -> pd.DataFrame:
        """Get all gas species."""
        return self.get_species_by_state('gas')

    def get_liquid_species(self) -> pd.DataFrame:
        """Get all liquid species."""
        return self.get_species_by_state('liq')

    def get_species_by_elements(self, elements: List[str]) -> pd.DataFrame:
        """
        Get species containing specific elements.

        A species matches when its formula contains at least one of the
        given element symbols. Symbols are matched case-sensitively and
        must not be followed by a lowercase letter, so 'C' does not match
        'Ca' or 'Cl' and 'Co' does not match 'CO2'.

        Parameters:
        -----------
        elements : List[str]
            List of element symbols

        Returns:
        --------
        pd.DataFrame
            Species containing any of the specified elements
        """
        import re

        if self._combined_data is None:
            self.load_all_data()

        # Escape each symbol so it is matched literally, then require that
        # the symbol is not the prefix of a longer (two-letter) symbol.
        alternatives = '|'.join(re.escape(symbol) for symbol in elements)
        pattern = f"(?:{alternatives})(?![a-z])"
        mask = self._combined_data['formula'].str.contains(pattern, na=False)

        return self._combined_data[mask].reset_index(drop=True)

    def get_thermodynamic_properties(self, species_data: pd.DataFrame) -> pd.DataFrame:
        """
        Extract thermodynamic properties from species data.

        Parameters:
        -----------
        species_data : pd.DataFrame
            Species data from get_species or similar methods

        Returns:
        --------
        pd.DataFrame
            'name', 'formula', 'state' plus whichever thermodynamic columns
            (G, H, S, Cp, V, etc.) are present, coerced to numeric;
            unparseable values become NaN
        """
        thermo_columns = ['G', 'H', 'S', 'Cp', 'V', 'a1.a', 'a2.b', 'a3.c',
                          'a4.d', 'c1.e', 'c2.f', 'omega.lambda', 'z.T']

        available_columns = [col for col in thermo_columns
                             if col in species_data.columns]
        result = species_data[['name', 'formula', 'state'] + available_columns].copy()

        # Convert numeric columns to proper numeric types
        for col in available_columns:
            result[col] = pd.to_numeric(result[col], errors='coerce')

        return result

    def get_database_stats(self) -> Dict[str, Union[int, Dict[str, int]]]:
        """
        Get statistics about the database.

        Returns:
        --------
        Dict
            Database statistics: 'total_species', 'states', 'source_files',
            'unique_names' and 'unique_formulas'. 'source_files' is empty
            when the data has no 'source_file' column (fallback data).
        """
        if self._combined_data is None:
            self.load_all_data()

        data = self._combined_data

        # Only data assembled by load_all_data carries 'source_file'.
        if 'source_file' in data.columns:
            source_counts = data['source_file'].value_counts().to_dict()
        else:
            source_counts = {}

        stats = {
            'total_species': len(data),
            'states': data['state'].value_counts().to_dict(),
            'source_files': source_counts,
            'unique_names': data['name'].nunique(),
            'unique_formulas': data['formula'].nunique(),
        }

        return stats

    def validate_data(self) -> Dict[str, List]:
        """
        Validate the OBIGT database for common issues.

        Returns:
        --------
        Dict
            Validation results with issues found. Row references are index
            labels of the combined table; 'invalid_numeric_values' holds
            (label, column) tuples.
        """
        if self._combined_data is None:
            self.load_all_data()

        data = self._combined_data
        issues = {
            'missing_names': [],
            'missing_formulas': [],
            'missing_states': [],
            'invalid_numeric_values': [],
            'duplicate_entries': []
        }

        # Check for missing critical fields
        for field, bucket in (('name', 'missing_names'),
                              ('formula', 'missing_formulas'),
                              ('state', 'missing_states')):
            missing = data[field].isna() | (data[field] == '')
            if missing.any():
                issues[bucket] = data[missing].index.tolist()

        # Check for invalid numeric values in key thermodynamic properties:
        # present in the raw column but not parseable as a number.
        for col in ['G', 'H', 'S', 'Cp']:
            if col in data.columns:
                numeric_data = pd.to_numeric(data[col], errors='coerce')
                invalid_mask = numeric_data.isna() & data[col].notna()
                if invalid_mask.any():
                    issues['invalid_numeric_values'].extend(
                        [(idx, col) for idx in data[invalid_mask].index]
                    )

        # Check for potential duplicates
        duplicate_mask = data.duplicated(subset=['name', 'formula', 'state'], keep=False)
        if duplicate_mask.any():
            issues['duplicate_entries'] = data[duplicate_mask].index.tolist()

        return issues

    def export_to_csv(self, filename: str, species_filter: Optional[str] = None):
        """
        Export database or filtered data to CSV.

        Parameters:
        -----------
        filename : str
            Output filename
        species_filter : str, optional
            Filter to apply (state name like 'aq', 'cr', etc.). Values other
            than 'aq', 'cr', 'gas' or 'liq' are ignored and the whole table
            is exported.
        """
        if self._combined_data is None:
            self.load_all_data()

        data_to_export = self._combined_data

        if species_filter:
            if species_filter in ['aq', 'cr', 'gas', 'liq']:
                data_to_export = self.get_species_by_state(species_filter)

        data_to_export.to_csv(filename, index=False)
This class provides methods to access, search, and manipulate the thermodynamic data from the OBIGT database files.
Initialize the OBIGT database.
Parameters:
data_loader : DataLoader, optional DataLoader instance to use. If None, creates a default loader.
Methods
def export_to_csv(self, filename: str, species_filter: str | None = None)-
Expand source code
def export_to_csv(self, filename: str, species_filter: Optional[str] = None):
    """
    Write the combined table, or one physical state of it, to a CSV file.

    Parameters:
    -----------
    filename : str
        Output filename
    species_filter : str, optional
        State to export ('aq', 'cr', 'gas' or 'liq'). Any other value,
        including None, exports the whole table.
    """
    if self._combined_data is None:
        self.load_all_data()

    recognised_states = ['aq', 'cr', 'gas', 'liq']
    if species_filter and species_filter in recognised_states:
        table = self.get_species_by_state(species_filter)
    else:
        table = self._combined_data

    # The row index is not written to the file.
    table.to_csv(filename, index=False)
Parameters:
filename : str Output filename species_filter : str, optional Filter to apply (state name like 'aq', 'cr', etc.)
def get_aqueous_species(self) ‑> pandas.core.frame.DataFrame-
Expand source code
def get_aqueous_species(self) -> pd.DataFrame:
    """Return every species whose state is 'aq'."""
    aqueous_state = 'aq'
    return self.get_species_by_state(aqueous_state)
def get_combined_data(self) ‑> pandas.core.frame.DataFrame-
Expand source code
def get_combined_data(self) -> pd.DataFrame:
    """
    Return the combined OBIGT table, loading it on first use.

    A cached table is returned as a copy. Otherwise the data is loaded
    with ``load_all_data``; if that raises, a warning is printed and the
    result of ``_create_fallback_data`` is returned instead.

    Returns
    -------
    pd.DataFrame
        Combined OBIGT data with all species
    """
    cached = self._combined_data
    if cached is not None:
        return cached.copy()

    try:
        loaded = self.load_all_data()
    except Exception as e:
        print(f"Warning: Could not load OBIGT data: {e}")
        # Minimal built-in table so essential species remain available.
        return self._create_fallback_data()
    return loaded
Returns
pd.DataFrame: Combined OBIGT data with all species
def get_crystalline_species(self) ‑> pandas.core.frame.DataFrame-
Expand source code
def get_crystalline_species(self) -> pd.DataFrame:
    """Return every species whose state is 'cr'."""
    crystalline_state = 'cr'
    return self.get_species_by_state(crystalline_state)
def get_database_stats(self) ‑> Dict[str, int | Dict[str, int]]-
Expand source code
def get_database_stats(self) -> Dict[str, Union[int, Dict[str, int]]]:
    """
    Get statistics about the database.

    Returns:
    --------
    Dict
        Database statistics: 'total_species', 'states', 'source_files',
        'unique_names' and 'unique_formulas'. 'source_files' is an empty
        dict when the data has no 'source_file' column.
    """
    if self._combined_data is None:
        self.load_all_data()

    data = self._combined_data

    # Not every table carries 'source_file'; report no sources rather
    # than raising KeyError when it is absent.
    if 'source_file' in data.columns:
        source_counts = data['source_file'].value_counts().to_dict()
    else:
        source_counts = {}

    stats = {
        'total_species': len(data),
        'states': data['state'].value_counts().to_dict(),
        'source_files': source_counts,
        'unique_names': data['name'].nunique(),
        'unique_formulas': data['formula'].nunique(),
    }

    return stats
Returns:
Dict Database statistics including total species, states, etc.
def get_gas_species(self) ‑> pandas.core.frame.DataFrame-
Expand source code
def get_gas_species(self) -> pd.DataFrame:
    """Return every species whose state is 'gas'."""
    gas_state = 'gas'
    return self.get_species_by_state(gas_state)
def get_liquid_species(self) ‑> pandas.core.frame.DataFrame-
Expand source code
def get_liquid_species(self) -> pd.DataFrame:
    """Return every species whose state is 'liq'."""
    liquid_state = 'liq'
    return self.get_species_by_state(liquid_state)
def get_species(self, identifier: str, state: str | None = None) ‑> pandas.core.frame.DataFrame-
Expand source code
def get_species(self, identifier: str, state: Optional[str] = None) -> pd.DataFrame:
    """
    Get species data by name, formula, or identifier.

    Exact matches on name, formula and ``name(state)`` are tried first;
    each matching row is returned once. Only if none is found does the
    method fall back to a case-insensitive literal substring search in
    the ``name`` and ``formula`` columns.

    Parameters:
    -----------
    identifier : str
        Species name, formula, or identifier
    state : str, optional
        Physical state ('aq', 'cr', 'gas', 'liq')

    Returns:
    --------
    pd.DataFrame
        Matching species data, re-indexed from 0
    """
    if self._combined_data is None:
        self.load_all_data()
    if self._species_index is None:
        self._create_species_index()

    data = self._combined_data
    if 'state' in data.columns:
        row_states = data['state'].astype(str).str.strip()
    else:
        row_states = pd.Series('', index=data.index)

    lookup_keys = [identifier, f"formula:{identifier}"]
    if state:
        lookup_keys.append(f"{identifier}({state})")

    # The species index stores row *labels* (1-based after loading), so
    # rows must be resolved with label-based access, not positions.
    # `seen` keeps a row from being returned once per matching key.
    labels = []
    seen = set()
    for key in lookup_keys:
        for label in self._species_index.get(key, ()):
            if label in seen:
                continue
            if state is None or row_states.loc[label] == state:
                seen.add(label)
                labels.append(label)

    # If no exact matches, try partial matching. regex=False so that
    # identifiers such as 'H+' are taken literally.
    if not labels:
        mask = (
            data['name'].str.contains(identifier, case=False, na=False, regex=False)
            | data['formula'].str.contains(identifier, case=False, na=False, regex=False)
        )
        if state:
            mask &= (row_states == state)
        labels = data.index[mask].tolist()

    if not labels:
        return pd.DataFrame(columns=data.columns)
    return data.loc[labels].reset_index(drop=True)
Parameters:
identifier : str Species name, formula, or identifier state : str, optional Physical state ('aq', 'cr', 'gas', 'liq')
Returns:
pd.DataFrame Matching species data
def get_species_by_elements(self, elements: List[str]) ‑> pandas.core.frame.DataFrame-
Expand source code
def get_species_by_elements(self, elements: List[str]) -> pd.DataFrame:
    """
    Get species containing specific elements.

    A species matches when its formula contains at least one of the
    given element symbols. Symbols are matched case-sensitively and must
    not be followed by a lowercase letter, so 'C' does not match 'Ca' or
    'Cl' and 'Co' does not match 'CO2'.

    Parameters:
    -----------
    elements : List[str]
        List of element symbols

    Returns:
    --------
    pd.DataFrame
        Species containing any of the specified elements, re-indexed
        from 0
    """
    import re

    if self._combined_data is None:
        self.load_all_data()

    # Escape each symbol so it is matched literally, then require that
    # the symbol is not the prefix of a longer (two-letter) symbol.
    alternatives = '|'.join(re.escape(symbol) for symbol in elements)
    pattern = f"(?:{alternatives})(?![a-z])"
    mask = self._combined_data['formula'].str.contains(pattern, na=False)

    return self._combined_data[mask].reset_index(drop=True)
Parameters:
elements : List[str] List of element symbols
Returns:
pd.DataFrame Species containing the specified elements
def get_species_by_state(self, state: str) ‑> pandas.core.frame.DataFrame-
Expand source code
def get_species_by_state(self, state: str) -> pd.DataFrame:
    """
    Select the rows whose 'state' column equals the given state.

    Parameters:
    -----------
    state : str
        Physical state ('aq', 'cr', 'gas', 'liq')

    Returns:
    --------
    pd.DataFrame
        Species data for the specified state, re-indexed from 0
    """
    if self._combined_data is None:
        self.load_all_data()

    table = self._combined_data
    selected = table[table['state'] == state]
    return selected.reset_index(drop=True)
Parameters:
state : str Physical state ('aq', 'cr', 'gas', 'liq')
Returns:
pd.DataFrame Species data for the specified state
def get_thermodynamic_properties(self, species_data: pandas.core.frame.DataFrame) ‑> pandas.core.frame.DataFrame-
Expand source code
def get_thermodynamic_properties(self, species_data: pd.DataFrame) -> pd.DataFrame:
    """
    Reduce species data to identifying columns plus numeric properties.

    Parameters:
    -----------
    species_data : pd.DataFrame
        Species data from get_species or similar methods

    Returns:
    --------
    pd.DataFrame
        'name', 'formula', 'state' followed by whichever thermodynamic
        columns (G, H, S, Cp, V, etc.) are present in the input, each
        coerced to numeric (unparseable values become NaN)
    """
    identity_columns = ['name', 'formula', 'state']
    property_columns = ['G', 'H', 'S', 'Cp', 'V', 'a1.a', 'a2.b', 'a3.c',
                        'a4.d', 'c1.e', 'c2.f', 'omega.lambda', 'z.T']

    present = []
    for column in property_columns:
        if column in species_data.columns:
            present.append(column)

    # Work on a copy so the caller's frame is left untouched.
    properties = species_data[identity_columns + present].copy()
    for column in present:
        properties[column] = pd.to_numeric(properties[column], errors='coerce')

    return properties
Parameters:
species_data : pd.DataFrame Species data from get_species or similar methods
Returns:
pd.DataFrame Thermodynamic properties (G, H, S, Cp, V, etc.)
def load_all_data(self, force_reload: bool = False) ‑> pandas.core.frame.DataFrame-
Expand source code
def load_all_data(self, force_reload: bool = False) -> pd.DataFrame:
    """
    Read every OBIGT file through the loader and merge them into one table.

    Each row is tagged with its originating file in a 'source_file'
    column, and the merged table is given a 1-based index. The result is
    cached and the species lookup index is rebuilt.

    Parameters:
    -----------
    force_reload : bool, default False
        Force reloading of data even if cached

    Returns:
    --------
    pd.DataFrame
        Copy of the combined OBIGT database
    """
    cached = self._combined_data
    if cached is not None and not force_reload:
        return cached.copy()

    files = self.loader.load_all_obigt_files()

    # assign() returns a new frame, so the loader's frames stay unmodified.
    tagged_frames = [
        frame.assign(source_file=filename)
        for filename, frame in files.items()
    ]

    merged = pd.concat(tagged_frames, ignore_index=True)

    # IMPORTANT: R uses 1-based indexing, so the index is shifted to match
    # R's row numbers. Row 0 in pandas should be row 1 in R.
    merged.index = merged.index + 1
    self._combined_data = merged

    # Rebuild the lookup index against the freshly loaded table.
    self._create_species_index()

    return self._combined_data.copy()
Parameters:
force_reload : bool, default False Force reloading of data even if cached
Returns:
pd.DataFrame Combined OBIGT database
def search_species(self, query: str, search_columns: List[str] | None = None) ‑> pandas.core.frame.DataFrame-
Expand source code
def search_species(self, query: str, search_columns: Optional[List[str]] = None) -> pd.DataFrame:
    """
    Search for species using a text query.

    The query is matched case-insensitively with ``Series.str.contains``
    in each requested column that exists in the data; requested columns
    that are absent are skipped.

    Parameters:
    -----------
    query : str
        Search query
    search_columns : List[str], optional
        Columns to search in. Default: ['name', 'formula', 'abbrv']

    Returns:
    --------
    pd.DataFrame
        Matching species data, re-indexed from 0
    """
    if self._combined_data is None:
        self.load_all_data()

    if search_columns is None:
        search_columns = ['name', 'formula', 'abbrv']

    data = self._combined_data

    # Build the mask on the data's own index (1-based after loading).
    # A default 0-based mask would only work through implicit index
    # alignment when OR-ed with the per-column results.
    mask = pd.Series(False, index=data.index)
    for col in search_columns:
        if col in data.columns:
            mask |= data[col].str.contains(query, case=False, na=False)

    return data[mask].reset_index(drop=True)
Parameters:
query : str Search query search_columns : List[str], optional Columns to search in. Default: ['name', 'formula', 'abbrv']
Returns:
pd.DataFrame Matching species data
def validate_data(self) ‑> Dict[str, List]-
Expand source code
def validate_data(self) -> Dict[str, List]:
    """
    Scan the combined table for missing fields, bad numbers and duplicates.

    Returns:
    --------
    Dict
        Validation results with issues found. 'missing_names',
        'missing_formulas', 'missing_states' and 'duplicate_entries' hold
        row index labels; 'invalid_numeric_values' holds (label, column)
        tuples.
    """
    if self._combined_data is None:
        self.load_all_data()

    table = self._combined_data
    issues = {
        'missing_names': [],
        'missing_formulas': [],
        'missing_states': [],
        'invalid_numeric_values': [],
        'duplicate_entries': []
    }

    # A critical field counts as missing when it is NaN or an empty string.
    critical_fields = (
        ('name', 'missing_names'),
        ('formula', 'missing_formulas'),
        ('state', 'missing_states'),
    )
    for field, bucket in critical_fields:
        column = table[field]
        blank = column.isna() | (column == '')
        if blank.any():
            issues[bucket] = table[blank].index.tolist()

    # A value is invalid when it is present but cannot be parsed as a number.
    for prop in ['G', 'H', 'S', 'Cp']:
        if prop not in table.columns:
            continue
        parsed = pd.to_numeric(table[prop], errors='coerce')
        unparseable = parsed.isna() & table[prop].notna()
        if unparseable.any():
            issues['invalid_numeric_values'].extend(
                [(label, prop) for label in table[unparseable].index]
            )

    # Rows sharing name, formula and state are all reported (keep=False).
    repeated = table.duplicated(subset=['name', 'formula', 'state'], keep=False)
    if repeated.any():
        issues['duplicate_entries'] = table[repeated].index.tolist()

    return issues
Returns:
Dict Validation results with issues found