Module pychnosz.data
Data management and access for the CHNOSZ thermodynamic database.
Sub-modules
pychnosz.data.add_obigt
Implementation of the add_OBIGT() function for Python CHNOSZ …
pychnosz.data.loader
Data loader module for CHNOSZ thermodynamic database files …
pychnosz.data.mod_obigt
Implementation of the mod_OBIGT() function for Python CHNOSZ …
pychnosz.data.obigt
OBIGT database access module …
pychnosz.data.worm
WORM database loader for CHNOSZ …
Functions
def get_default_loader() -> DataLoader

    def get_default_loader() -> DataLoader:
        """
        Get a default DataLoader instance.

        Returns:
        --------
        DataLoader
            Default DataLoader instance
        """
        return DataLoader()
def get_default_obigt() -> OBIGTDatabase

    def get_default_obigt() -> OBIGTDatabase:
        """
        Get a default OBIGT database instance.

        Returns:
        --------
        OBIGTDatabase
            Default OBIGT database instance
        """
        return OBIGTDatabase()
Classes
class DataLoader (data_path: str | pathlib.Path | None = None)

Main data loader class for CHNOSZ thermodynamic database files.

This class handles loading of various data files from the CHNOSZ R package, including compressed files, and converts them to pandas DataFrames while preserving data integrity.

Parameters:
data_path : str or Path, optional
    Path to the CHNOSZ data directory. If None, the loader looks for the extdata directory next to this file within the package.

Constructor and private helpers (the public methods are documented individually under Methods below):

    class DataLoader:

        def __init__(self, data_path: Optional[Union[str, Path]] = None):
            if data_path is None:
                # Find the data directory relative to this file:
                # we are in pychnosz/data/, so extdata sits alongside.
                current_dir = Path(__file__).parent
                self.data_path = current_dir / "extdata"
            else:
                self.data_path = Path(data_path)

            if not self.data_path.exists():
                raise FileNotFoundError(f"Data directory not found: {self.data_path}")

            self.obigt_path = self.data_path / "OBIGT"
            self.thermo_path = self.data_path / "thermo"

            # Cache for loaded data
            self._cache = {}

        def _read_csv_safe(self, filepath: Path, **kwargs) -> pd.DataFrame:
            """Safely read a CSV file, trying several encodings."""
            try:
                # Handle potential encoding issues
                for encoding in ['utf-8', 'latin-1', 'cp1252']:
                    try:
                        return pd.read_csv(filepath, encoding=encoding, **kwargs)
                    except UnicodeDecodeError:
                        continue
                # If all encodings fail, replace undecodable bytes.
                # (pandas takes encoding_errors, not errors, for this.)
                df = pd.read_csv(filepath, encoding='utf-8',
                                 encoding_errors='replace', **kwargs)
                warnings.warn(f"Used error replacement for file {filepath}")
                return df
            except Exception as e:
                raise IOError(f"Failed to read {filepath}: {str(e)}")

        def _read_compressed_csv(self, filepath: Path, **kwargs) -> pd.DataFrame:
            """Read a compressed (.xz) CSV file."""
            if filepath.suffix == '.xz':
                with lzma.open(filepath, 'rt', encoding='utf-8') as f:
                    return pd.read_csv(f, **kwargs)
            raise ValueError(f"Unsupported compression format: {filepath.suffix}")
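A short construction sketch; the explicit path below is hypothetical and only illustrates the data_path parameter:

    from pychnosz.data import DataLoader

    loader = DataLoader()                    # locate extdata relative to the package
    custom = DataLoader("/path/to/extdata")  # hypothetical explicit data directory
    print(loader.get_available_obigt_files())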
Methods
def clear_cache(self)

    def clear_cache(self):
        """Clear all cached data."""
        self._cache.clear()
def get_available_obigt_files(self) -> List[str]

    def get_available_obigt_files(self) -> List[str]:
        """
        Get list of available OBIGT files.

        Returns:
        --------
        List[str]
            List of available OBIGT filenames
        """
        if not self.obigt_path.exists():
            return []
        return [f.name for f in self.obigt_path.glob("*.csv")]
def get_available_thermo_files(self) -> List[str]

    def get_available_thermo_files(self) -> List[str]:
        """
        Get list of available thermo files.

        Returns:
        --------
        List[str]
            List of available thermo filenames
        """
        if not self.thermo_path.exists():
            return []
        # Collect both .csv and .csv.xz files
        csv_files = [f.name for f in self.thermo_path.glob("*.csv")]
        xz_files = [f.name for f in self.thermo_path.glob("*.csv.xz")]
        return sorted(csv_files + xz_files)
def get_cache_info(self) -> Dict[str, int]

    def get_cache_info(self) -> Dict[str, int]:
        """
        Get information about cached data.

        Returns:
        --------
        Dict[str, int]
            Dictionary mapping cache keys to DataFrame row counts
        """
        return {key: len(df) for key, df in self._cache.items()}
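Cache behavior in brief: the loaders key the cache by file kind and name (obigt_* or thermo_*) and hand back copies, so cached frames are never mutated by callers. A sketch, assuming the default data directory is available:

    from pychnosz.data import get_default_loader

    loader = get_default_loader()
    loader.load_elements()          # first call reads element.csv from disk
    loader.load_elements()          # second call is served from the cache
    print(loader.get_cache_info())  # e.g. {'thermo_element.csv': <row count>}
    loader.clear_cache()            # empty the cache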
def get_data_path(self) -> pathlib.Path

    def get_data_path(self) -> Path:
        """
        Get the data directory path.

        Returns
        -------
        Path
            Path to the data directory
        """
        return self.data_path
def load_all_obigt_files(self, use_cache: bool = True) -> Dict[str, pandas.core.frame.DataFrame]

    def load_all_obigt_files(self, use_cache: bool = True) -> Dict[str, pd.DataFrame]:
        """
        Load all OBIGT database files in the same order as R CHNOSZ.

        This mirrors the exact loading order from the R CHNOSZ/thermo.R
        OBIGT() function to ensure identical species indices between the
        R and Python versions.

        Parameters:
        -----------
        use_cache : bool, default True
            Whether to use cached data if available

        Returns:
        --------
        Dict[str, pd.DataFrame]
            Dictionary with filenames as keys and DataFrames as values,
            ordered like R CHNOSZ
        """
        obigt_files = {}

        if not self.obigt_path.exists():
            raise FileNotFoundError(f"OBIGT directory not found: {self.obigt_path}")

        # Use exact same order as R CHNOSZ (from thermo.R lines 63-67):
        #   sources_aq  <- paste0(c("H2O", "inorganic", "organic"), "_aq")
        #   sources_cr  <- paste0(c("Berman", "inorganic", "organic"), "_cr")
        #   sources_liq <- paste0(c("organic"), "_liq")
        #   sources_gas <- paste0(c("inorganic", "organic"), "_gas")
        #   sources <- c(sources_aq, sources_cr, sources_gas, sources_liq)
        r_chnosz_order = [
            "H2O_aq.csv", "inorganic_aq.csv", "organic_aq.csv",
            "Berman_cr.csv", "inorganic_cr.csv", "organic_cr.csv",
            "inorganic_gas.csv", "organic_gas.csv",
            "organic_liq.csv"
        ]

        # Load files in R CHNOSZ order
        for filename in r_chnosz_order:
            file_path = self.obigt_path / filename
            if file_path.exists():
                obigt_files[filename] = self.load_obigt_file(filename, use_cache=use_cache)
            else:
                warnings.warn(f"OBIGT file not found: {filename}")

        return obigt_files
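Because Python dicts preserve insertion order, the returned mapping can be concatenated directly to reproduce R's row order; a minimal sketch mirroring what OBIGTDatabase.load_all_data does internally:

    import pandas as pd
    from pychnosz.data import get_default_loader

    loader = get_default_loader()
    files = loader.load_all_obigt_files()
    print(list(files))  # filenames in R CHNOSZ load order
    combined = pd.concat(files.values(), ignore_index=True)
    combined.index = combined.index + 1  # 1-based rows, matching R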
def load_buffer(self, use_cache: bool = True) -> pandas.core.frame.DataFrame

    def load_buffer(self, use_cache: bool = True) -> pd.DataFrame:
        """
        Load the buffer data file.

        Parameters:
        -----------
        use_cache : bool, default True
            Whether to use cached data if available

        Returns:
        --------
        pd.DataFrame
            Buffer data with columns: name, species, state, logact
        """
        return self.load_thermo_file('buffer.csv', use_cache=use_cache)
def load_buffers(self, use_cache: bool = True) -> pandas.core.frame.DataFrame

    def load_buffers(self, use_cache: bool = True) -> pd.DataFrame:
        """
        Load buffer data (alias for load_buffer, kept for compatibility).

        Parameters
        ----------
        use_cache : bool, default True
            Whether to use cached data if available

        Returns
        -------
        pd.DataFrame
            Buffer data
        """
        try:
            return self.load_buffer(use_cache=use_cache)
        except Exception:
            # Return an empty DataFrame if buffer data is not available
            return pd.DataFrame(columns=['name', 'species', 'state', 'logact'])
def load_elements(self, use_cache: bool = True) -> pandas.core.frame.DataFrame

    def load_elements(self, use_cache: bool = True) -> pd.DataFrame:
        """
        Load the elements data file.

        Parameters:
        -----------
        use_cache : bool, default True
            Whether to use cached data if available

        Returns:
        --------
        pd.DataFrame
            Elements data with columns: element, state, source, mass, s, n
        """
        return self.load_thermo_file('element.csv', use_cache=use_cache)
def load_obigt_file(self, filename: str, use_cache: bool = True) -> pandas.core.frame.DataFrame

    def load_obigt_file(self, filename: str, use_cache: bool = True) -> pd.DataFrame:
        """
        Load a specific OBIGT database file.

        Parameters:
        -----------
        filename : str
            Name of the OBIGT file to load (e.g., 'inorganic_aq.csv')
        use_cache : bool, default True
            Whether to use cached data if available

        Returns:
        --------
        pd.DataFrame
            Loaded OBIGT data
        """
        cache_key = f"obigt_{filename}"
        if use_cache and cache_key in self._cache:
            return self._cache[cache_key].copy()

        filepath = self.obigt_path / filename
        if not filepath.exists():
            raise FileNotFoundError(f"OBIGT file not found: {filepath}")

        # Load the data
        df = self._read_csv_safe(filepath)

        # Clean up column names (remove any whitespace)
        df.columns = df.columns.str.strip()

        # Cache the result
        if use_cache:
            self._cache[cache_key] = df.copy()

        return df
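For example, loading a single source file (column names and row counts depend on the installed database version):

    from pychnosz.data import get_default_loader

    loader = get_default_loader()
    df = loader.load_obigt_file("inorganic_aq.csv")
    print(len(df), df.columns[:4].tolist())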
def load_protein(self, use_cache: bool = True) -> pandas.core.frame.DataFrame

    def load_protein(self, use_cache: bool = True) -> pd.DataFrame:
        """
        Load the protein data file.

        Parameters:
        -----------
        use_cache : bool, default True
            Whether to use cached data if available

        Returns:
        --------
        pd.DataFrame
            Protein data with amino acid compositions
        """
        return self.load_thermo_file('protein.csv', use_cache=use_cache)
def load_proteins(self, use_cache: bool = True) -> pandas.core.frame.DataFrame

    def load_proteins(self, use_cache: bool = True) -> pd.DataFrame:
        """
        Load protein data (alias for load_protein, kept for compatibility).

        Parameters
        ----------
        use_cache : bool, default True
            Whether to use cached data if available

        Returns
        -------
        pd.DataFrame
            Protein data
        """
        try:
            return self.load_protein(use_cache=use_cache)
        except Exception:
            # Return an empty DataFrame if protein data is not available
            return pd.DataFrame(columns=['protein', 'organism', 'ref', 'abbrv', 'chains'])
def load_refs(self, use_cache: bool = True) -> pandas.core.frame.DataFrame

    def load_refs(self, use_cache: bool = True) -> pd.DataFrame:
        """
        Load the references data file.

        Parameters
        ----------
        use_cache : bool, default True
            Whether to use cached data if available

        Returns
        -------
        pd.DataFrame
            References data
        """
        try:
            return self.load_thermo_file('refs.csv', use_cache=use_cache)
        except Exception:
            # Return an empty DataFrame if refs data is not available
            return pd.DataFrame(columns=['key', 'author', 'year', 'citation'])
def load_stoich(self, use_cache: bool = True) -> pandas.core.frame.DataFrame

    def load_stoich(self, use_cache: bool = True) -> pd.DataFrame:
        """
        Load the stoichiometry data file (compressed).

        Parameters:
        -----------
        use_cache : bool, default True
            Whether to use cached data if available

        Returns:
        --------
        pd.DataFrame
            Stoichiometry matrix for all species
        """
        return self.load_thermo_file('stoich.csv.xz', use_cache=use_cache)
def load_thermo_file(self, filename: str, use_cache: bool = True) -> pandas.core.frame.DataFrame

    def load_thermo_file(self, filename: str, use_cache: bool = True) -> pd.DataFrame:
        """
        Load a specific thermo database file.

        Parameters:
        -----------
        filename : str
            Name of the thermo file to load (e.g., 'element.csv', 'stoich.csv.xz')
        use_cache : bool, default True
            Whether to use cached data if available

        Returns:
        --------
        pd.DataFrame
            Loaded thermo data
        """
        cache_key = f"thermo_{filename}"
        if use_cache and cache_key in self._cache:
            return self._cache[cache_key].copy()

        filepath = self.thermo_path / filename
        if not filepath.exists():
            raise FileNotFoundError(f"Thermo file not found: {filepath}")

        # Handle compressed files
        if filepath.suffix == '.xz':
            df = self._read_compressed_csv(filepath)
        else:
            df = self._read_csv_safe(filepath)

        # Clean up column names
        df.columns = df.columns.str.strip()

        # Cache the result
        if use_cache:
            self._cache[cache_key] = df.copy()

        return df
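The .xz branch above delegates to the private _read_compressed_csv helper, which is a thin wrapper over the standard-library lzma module; a standalone sketch of the same idea (the file path here is illustrative):

    import lzma
    import pandas as pd

    # Equivalent to what DataLoader does for stoich.csv.xz
    with lzma.open("extdata/thermo/stoich.csv.xz", "rt", encoding="utf-8") as f:
        stoich = pd.read_csv(f)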
class OBIGTDatabase (data_loader: DataLoader | None = None)

High-level interface to the OBIGT thermodynamic database.

This class provides methods to access, search, and manipulate the thermodynamic data from the OBIGT database files.

Parameters:
data_loader : DataLoader, optional
    DataLoader instance to use. If None, creates a default loader.

Constructor and private helpers (the public methods are documented individually under Methods below):

    class OBIGTDatabase:

        def __init__(self, data_loader: Optional[DataLoader] = None):
            if data_loader is None:
                from .loader import get_default_loader
                self.loader = get_default_loader()
            else:
                self.loader = data_loader

            # Cache for combined data
            self._combined_data = None
            self._species_index = None

            # Expected columns for OBIGT data
            self.obigt_columns = [
                'name', 'abbrv', 'formula', 'state', 'ref1', 'ref2',
                'date', 'model', 'E_units', 'G', 'H', 'S', 'Cp', 'V',
                'a1.a', 'a2.b', 'a3.c', 'a4.d', 'c1.e', 'c2.f',
                'omega.lambda', 'z.T'
            ]

            # State classifications
            self.aqueous_states = ['aq']
            self.crystalline_states = ['cr']
            self.gas_states = ['gas']
            self.liquid_states = ['liq']

        def _create_fallback_data(self) -> pd.DataFrame:
            """Create minimal fallback data for essential species
            (approximate values, for basic functionality only)."""
            fallback_data = {
                'name':    ['water', 'H+', 'OH-', 'CO2', 'HCO3-', 'CO3-2'],
                'abbrv':   ['H2O', 'H+', 'OH-', 'CO2', 'HCO3-', 'CO3-2'],
                'formula': ['H2O', 'H+', 'OH-', 'CO2', 'HCO3-', 'CO3-2'],
                'state':   ['liq', 'aq', 'aq', 'aq', 'aq', 'aq'],
                'G':  [-56688.1, 0.0, -37595.0, -92307.0, -140314.0, -126172.0],
                'H':  [-68317.0, 0.0, -54977.0, -98900.0, -165180.0, -161963.0],
                'S':  [16.712, 0.0, -2.56, -39.75, 98.4, -50.0],
                'Cp': [18.0, 0.0, -36.4, 37.11, 25.0, -53.1],
                'V':  [18.068, 0.0, -4.71, 34.0, 25.0, -6.0],
                'z.T': [0, 1, -1, 0, -1, -2],
            }
            # Remaining text and equation-of-state columns are blank/zero
            for col in ['ref1', 'ref2', 'date', 'model', 'E_units']:
                fallback_data[col] = [''] * 6
            for col in ['a1.a', 'a2.b', 'a3.c', 'a4.d', 'c1.e', 'c2.f', 'omega.lambda']:
                fallback_data[col] = [0.0] * 6

            df = pd.DataFrame(fallback_data)
            # Cache the fallback data and index it
            self._combined_data = df
            self._create_species_index()
            return df.copy()

        def _create_species_index(self):
            """Create an index for fast species lookups."""
            if self._combined_data is None:
                return

            # Map names, formulas, and name(state) keys to row labels
            self._species_index = {}
            for idx, row in self._combined_data.iterrows():
                name = str(row.get('name', '')).strip()
                formula = str(row.get('formula', '')).strip()
                state = str(row.get('state', '')).strip()

                if name:
                    self._species_index.setdefault(name, []).append(idx)
                if formula:
                    self._species_index.setdefault(f"formula:{formula}", []).append(idx)
                if name and state:
                    self._species_index.setdefault(f"{name}({state})", []).append(idx)
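Either constructor path yields the same interface; a short sketch:

    from pychnosz.data import DataLoader, OBIGTDatabase

    db = OBIGTDatabase()                          # builds a default DataLoader internally
    db = OBIGTDatabase(data_loader=DataLoader())  # or inject an existing loader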
Methods
def export_to_csv(self, filename: str, species_filter: str | None = None)

    def export_to_csv(self, filename: str, species_filter: Optional[str] = None):
        """
        Export the database or filtered data to CSV.

        Parameters:
        -----------
        filename : str
            Output filename
        species_filter : str, optional
            Filter to apply (state name like 'aq', 'cr', etc.)
        """
        if self._combined_data is None:
            self.load_all_data()

        data_to_export = self._combined_data
        if species_filter and species_filter in ['aq', 'cr', 'gas', 'liq']:
            data_to_export = self.get_species_by_state(species_filter)

        data_to_export.to_csv(filename, index=False)
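Usage sketch (the output filenames are arbitrary):

    from pychnosz.data import get_default_obigt

    db = get_default_obigt()
    db.export_to_csv("aqueous_species.csv", species_filter="aq")  # aqueous subset
    db.export_to_csv("obigt_all.csv")                             # entire database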
def get_aqueous_species(self) -> pandas.core.frame.DataFrame

    def get_aqueous_species(self) -> pd.DataFrame:
        """Get all aqueous species."""
        return self.get_species_by_state('aq')
def get_combined_data(self) -> pandas.core.frame.DataFrame

    def get_combined_data(self) -> pd.DataFrame:
        """
        Get combined OBIGT thermodynamic data.

        Returns
        -------
        pd.DataFrame
            Combined OBIGT data with all species
        """
        if self._combined_data is not None:
            return self._combined_data.copy()
        try:
            # Try to load data normally first
            return self.load_all_data()
        except Exception as e:
            print(f"Warning: Could not load OBIGT data: {e}")
            # Fall back to minimal data for essential species
            return self._create_fallback_data()
def get_crystalline_species(self) -> pandas.core.frame.DataFrame

    def get_crystalline_species(self) -> pd.DataFrame:
        """Get all crystalline species."""
        return self.get_species_by_state('cr')
def get_database_stats(self) -> Dict[str, int | Dict[str, int]]

    def get_database_stats(self) -> Dict[str, Union[int, Dict[str, int]]]:
        """
        Get statistics about the database.

        Returns:
        --------
        Dict
            Database statistics including total species, states, etc.
        """
        if self._combined_data is None:
            self.load_all_data()

        stats = {
            'total_species': len(self._combined_data),
            'states': self._combined_data['state'].value_counts().to_dict(),
            'source_files': self._combined_data['source_file'].value_counts().to_dict(),
            'unique_names': self._combined_data['name'].nunique(),
            'unique_formulas': self._combined_data['formula'].nunique(),
        }
        return stats
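For instance (the printed numbers depend on the installed database version):

    from pychnosz.data import get_default_obigt

    db = get_default_obigt()
    stats = db.get_database_stats()
    print(stats['total_species'])
    print(stats['states'])  # counts keyed by 'aq', 'cr', 'gas', 'liq'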
def get_gas_species(self) -> pandas.core.frame.DataFrame

    def get_gas_species(self) -> pd.DataFrame:
        """Get all gas species."""
        return self.get_species_by_state('gas')
def get_liquid_species(self) -> pandas.core.frame.DataFrame

    def get_liquid_species(self) -> pd.DataFrame:
        """Get all liquid species."""
        return self.get_species_by_state('liq')
def get_species(self, identifier: str, state: str | None = None) -> pandas.core.frame.DataFrame

    def get_species(self, identifier: str, state: Optional[str] = None) -> pd.DataFrame:
        """
        Get species data by name, formula, or identifier.

        Parameters:
        -----------
        identifier : str
            Species name, formula, or identifier
        state : str, optional
            Physical state ('aq', 'cr', 'gas', 'liq')

        Returns:
        --------
        pd.DataFrame
            Matching species data
        """
        if self._combined_data is None:
            self.load_all_data()

        results = []

        # Try exact name match first
        if identifier in self._species_index:
            for idx in self._species_index[identifier]:
                # The species index stores the (1-based) row labels,
                # so look rows up with .loc rather than positional .iloc
                row = self._combined_data.loc[idx]
                if state is None or str(row.get('state', '')).strip() == state:
                    results.append(row)

        # Try formula match
        formula_key = f"formula:{identifier}"
        if formula_key in self._species_index:
            for idx in self._species_index[formula_key]:
                row = self._combined_data.loc[idx]
                if state is None or str(row.get('state', '')).strip() == state:
                    results.append(row)

        # Try name+state combination
        if state:
            name_state_key = f"{identifier}({state})"
            if name_state_key in self._species_index:
                for idx in self._species_index[name_state_key]:
                    results.append(self._combined_data.loc[idx])

        # If no exact matches, try partial matching
        if not results:
            mask = self._combined_data['name'].str.contains(identifier, case=False, na=False) | \
                   self._combined_data['formula'].str.contains(identifier, case=False, na=False)
            if state:
                mask &= (self._combined_data['state'] == state)
            partial_matches = self._combined_data[mask]
            results = [row for _, row in partial_matches.iterrows()]

        if results:
            return pd.DataFrame(results).reset_index(drop=True)
        return pd.DataFrame(columns=self._combined_data.columns)
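A lookup sketch; 'water' and 'CH4' are species the default OBIGT files are expected to contain:

    from pychnosz.data import get_default_obigt

    db = get_default_obigt()
    water = db.get_species("water", state="liq")    # exact name + state match
    methane = db.get_species("CH4")                 # falls through to formula matching
    props = db.get_thermodynamic_properties(water)  # numeric G, H, S, Cp, V columns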
def get_species_by_elements(self, elements: List[str]) -> pandas.core.frame.DataFrame

    def get_species_by_elements(self, elements: List[str]) -> pd.DataFrame:
        """
        Get species containing specific elements.

        Parameters:
        -----------
        elements : List[str]
            List of element symbols

        Returns:
        --------
        pd.DataFrame
            Species containing the specified elements
        """
        if self._combined_data is None:
            self.load_all_data()

        # Create search pattern for elements
        pattern = '|'.join(elements)
        mask = self._combined_data['formula'].str.contains(pattern, case=False, na=False)

        return self._combined_data[mask].reset_index(drop=True)
def get_species_by_state(self, state: str) -> pandas.core.frame.DataFrame

    def get_species_by_state(self, state: str) -> pd.DataFrame:
        """
        Get all species in a specific physical state.

        Parameters:
        -----------
        state : str
            Physical state ('aq', 'cr', 'gas', 'liq')

        Returns:
        --------
        pd.DataFrame
            Species data for the specified state
        """
        if self._combined_data is None:
            self.load_all_data()

        mask = self._combined_data['state'] == state
        return self._combined_data[mask].reset_index(drop=True)
def get_thermodynamic_properties(self, species_data: pandas.core.frame.DataFrame) -> pandas.core.frame.DataFrame

    def get_thermodynamic_properties(self, species_data: pd.DataFrame) -> pd.DataFrame:
        """
        Extract thermodynamic properties from species data.

        Parameters:
        -----------
        species_data : pd.DataFrame
            Species data from get_species or similar methods

        Returns:
        --------
        pd.DataFrame
            Thermodynamic properties (G, H, S, Cp, V, etc.)
        """
        thermo_columns = ['G', 'H', 'S', 'Cp', 'V', 'a1.a', 'a2.b', 'a3.c', 'a4.d',
                          'c1.e', 'c2.f', 'omega.lambda', 'z.T']

        available_columns = [col for col in thermo_columns if col in species_data.columns]
        result = species_data[['name', 'formula', 'state'] + available_columns].copy()

        # Convert numeric columns to proper numeric types
        for col in available_columns:
            result[col] = pd.to_numeric(result[col], errors='coerce')

        return result
def load_all_data(self, force_reload: bool = False) -> pandas.core.frame.DataFrame

    def load_all_data(self, force_reload: bool = False) -> pd.DataFrame:
        """
        Load and combine all OBIGT data files.

        Parameters:
        -----------
        force_reload : bool, default False
            Force reloading of data even if cached

        Returns:
        --------
        pd.DataFrame
            Combined OBIGT database
        """
        if self._combined_data is not None and not force_reload:
            return self._combined_data.copy()

        # Load all OBIGT files
        obigt_files = self.loader.load_all_obigt_files()

        # Combine all files, tagging each row with its source file
        combined_data = []
        for filename, df in obigt_files.items():
            df_copy = df.copy()
            df_copy['source_file'] = filename
            combined_data.append(df_copy)

        # Concatenate all data
        self._combined_data = pd.concat(combined_data, ignore_index=True)

        # IMPORTANT: R uses 1-based indexing, so shift the DataFrame index
        # to match R's row numbers: row 0 in pandas becomes row 1 in R.
        self._combined_data.index = self._combined_data.index + 1

        # Create species index for fast lookups
        self._create_species_index()

        return self._combined_data.copy()
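The index shift is what keeps species indices interchangeable with R CHNOSZ; a quick check:

    from pychnosz.data import get_default_obigt

    db = get_default_obigt()
    data = db.load_all_data()
    print(data.index.min())     # 1, not 0 -- row n here is row n in R
    print(data.loc[1, 'name'])  # typically 'water', the first entry of H2O_aq.csv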
def search_species(self, query: str, search_columns: List[str] | None = None) -> pandas.core.frame.DataFrame

    def search_species(self, query: str, search_columns: Optional[List[str]] = None) -> pd.DataFrame:
        """
        Search for species using a text query.

        Parameters:
        -----------
        query : str
            Search query
        search_columns : List[str], optional
            Columns to search in. Default: ['name', 'formula', 'abbrv']

        Returns:
        --------
        pd.DataFrame
            Matching species data
        """
        if self._combined_data is None:
            self.load_all_data()

        if search_columns is None:
            search_columns = ['name', 'formula', 'abbrv']

        # Build the search mask on the same (1-based) index as the data,
        # so the OR below aligns row-for-row
        mask = pd.Series(False, index=self._combined_data.index)
        for col in search_columns:
            if col in self._combined_data.columns:
                mask |= self._combined_data[col].str.contains(query, case=False, na=False)

        return self._combined_data[mask].reset_index(drop=True)
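A search sketch (case-insensitive substring matching over the chosen columns):

    from pychnosz.data import get_default_obigt

    db = get_default_obigt()
    hits = db.search_species("acetate")                          # name/formula/abbrv
    hits = db.search_species("CH3", search_columns=["formula"])  # restrict to formulas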
def validate_data(self) -> Dict[str, List]

    def validate_data(self) -> Dict[str, List]:
        """
        Validate the OBIGT database for common issues.

        Returns:
        --------
        Dict
            Validation results with issues found
        """
        if self._combined_data is None:
            self.load_all_data()

        issues = {
            'missing_names': [],
            'missing_formulas': [],
            'missing_states': [],
            'invalid_numeric_values': [],
            'duplicate_entries': []
        }

        # Check for missing critical fields
        missing_names = self._combined_data['name'].isna() | (self._combined_data['name'] == '')
        if missing_names.any():
            issues['missing_names'] = self._combined_data[missing_names].index.tolist()

        missing_formulas = self._combined_data['formula'].isna() | (self._combined_data['formula'] == '')
        if missing_formulas.any():
            issues['missing_formulas'] = self._combined_data[missing_formulas].index.tolist()

        missing_states = self._combined_data['state'].isna() | (self._combined_data['state'] == '')
        if missing_states.any():
            issues['missing_states'] = self._combined_data[missing_states].index.tolist()

        # Check for invalid numeric values in key thermodynamic properties
        numeric_columns = ['G', 'H', 'S', 'Cp']
        for col in numeric_columns:
            if col in self._combined_data.columns:
                numeric_data = pd.to_numeric(self._combined_data[col], errors='coerce')
                invalid_mask = numeric_data.isna() & self._combined_data[col].notna()
                if invalid_mask.any():
                    issues['invalid_numeric_values'].extend(
                        [(idx, col) for idx in self._combined_data[invalid_mask].index]
                    )

        # Check for potential duplicates
        duplicate_mask = self._combined_data.duplicated(subset=['name', 'formula', 'state'], keep=False)
        if duplicate_mask.any():
            issues['duplicate_entries'] = self._combined_data[duplicate_mask].index.tolist()

        return issues
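A sketch of acting on the validation report (row labels use the database's 1-based index):

    from pychnosz.data import get_default_obigt

    db = get_default_obigt()
    issues = db.validate_data()
    for kind, rows in issues.items():
        if rows:
            print(f"{kind}: {len(rows)} entries")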