Module pychnosz.data
Data management and access for the CHNOSZ thermodynamic database.
Sub-modules
pychnosz.data.add_obigt
Implementation of the add_OBIGT() function for Python CHNOSZ …
pychnosz.data.loader
Data loader module for CHNOSZ thermodynamic database files …
pychnosz.data.mod_obigt
Implementation of the mod_OBIGT() function for Python CHNOSZ …
pychnosz.data.obigt
OBIGT database access module …
pychnosz.data.worm
WORM database loader for CHNOSZ …
Functions
def get_default_loader() -> DataLoader

    def get_default_loader() -> DataLoader:
        """
        Get a default DataLoader instance.

        Returns:
        --------
        DataLoader
            Default DataLoader instance
        """
        return DataLoader()
def get_default_obigt() -> OBIGTDatabase

    def get_default_obigt() -> OBIGTDatabase:
        """
        Get a default OBIGT database instance.

        Returns:
        --------
        OBIGTDatabase
            Default OBIGT database instance
        """
        return OBIGTDatabase()
Classes
class DataLoader (data_path: str | pathlib.Path | None = None)

Main data loader class for CHNOSZ thermodynamic database files.

This class handles loading of various data files from the CHNOSZ R package, including compressed files, and converts them to pandas DataFrames while preserving data integrity.

Parameters:
data_path : str or Path, optional
    Path to the CHNOSZ data directory. If None, the loader looks for the extdata directory next to this file within the package.

Constructor and private helpers (the public methods are documented individually under Methods below):

    class DataLoader:

        def __init__(self, data_path: Optional[Union[str, Path]] = None):
            if data_path is None:
                # Find the data directory relative to this file:
                # we are in pychnosz/data/, so extdata sits alongside.
                current_dir = Path(__file__).parent
                self.data_path = current_dir / "extdata"
            else:
                self.data_path = Path(data_path)

            if not self.data_path.exists():
                raise FileNotFoundError(f"Data directory not found: {self.data_path}")

            self.obigt_path = self.data_path / "OBIGT"
            self.thermo_path = self.data_path / "thermo"

            # Cache for loaded data
            self._cache = {}

        def _read_csv_safe(self, filepath: Path, **kwargs) -> pd.DataFrame:
            """Safely read a CSV file, trying several encodings."""
            try:
                # Handle potential encoding issues
                for encoding in ['utf-8', 'latin-1', 'cp1252']:
                    try:
                        return pd.read_csv(filepath, encoding=encoding, **kwargs)
                    except UnicodeDecodeError:
                        continue
                # If all encodings fail, replace undecodable bytes.
                # (pandas takes encoding_errors, not errors, for this.)
                df = pd.read_csv(filepath, encoding='utf-8',
                                 encoding_errors='replace', **kwargs)
                warnings.warn(f"Used error replacement for file {filepath}")
                return df
            except Exception as e:
                raise IOError(f"Failed to read {filepath}: {str(e)}")

        def _read_compressed_csv(self, filepath: Path, **kwargs) -> pd.DataFrame:
            """Read a compressed (.xz) CSV file."""
            if filepath.suffix == '.xz':
                with lzma.open(filepath, 'rt', encoding='utf-8') as f:
                    return pd.read_csv(f, **kwargs)
            raise ValueError(f"Unsupported compression format: {filepath.suffix}")
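A short construction sketch; the explicit path below is hypothetical and only illustrates the data_path parameter:

    from pychnosz.data import DataLoader

    loader = DataLoader()                    # locate extdata relative to the package
    custom = DataLoader("/path/to/extdata")  # hypothetical explicit data directory
    print(loader.get_available_obigt_files())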
Methods
def clear_cache(self)

    def clear_cache(self):
        """Clear all cached data."""
        self._cache.clear()
def get_available_obigt_files(self) -> List[str]

    def get_available_obigt_files(self) -> List[str]:
        """
        Get list of available OBIGT files.

        Returns:
        --------
        List[str]
            List of available OBIGT filenames
        """
        if not self.obigt_path.exists():
            return []
        return [f.name for f in self.obigt_path.glob("*.csv")]
def get_available_thermo_files(self) -> List[str]

    def get_available_thermo_files(self) -> List[str]:
        """
        Get list of available thermo files.

        Returns:
        --------
        List[str]
            List of available thermo filenames
        """
        if not self.thermo_path.exists():
            return []
        # Collect both .csv and .csv.xz files
        csv_files = [f.name for f in self.thermo_path.glob("*.csv")]
        xz_files = [f.name for f in self.thermo_path.glob("*.csv.xz")]
        return sorted(csv_files + xz_files)
def get_cache_info(self) -> Dict[str, int]

    def get_cache_info(self) -> Dict[str, int]:
        """
        Get information about cached data.

        Returns:
        --------
        Dict[str, int]
            Dictionary mapping cache keys to DataFrame row counts
        """
        return {key: len(df) for key, df in self._cache.items()}
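Cache behavior in brief: the loaders key the cache by file kind and name (obigt_* or thermo_*) and hand back copies, so cached frames are never mutated by callers. A sketch, assuming the default data directory is available:

    from pychnosz.data import get_default_loader

    loader = get_default_loader()
    loader.load_elements()          # first call reads element.csv from disk
    loader.load_elements()          # second call is served from the cache
    print(loader.get_cache_info())  # e.g. {'thermo_element.csv': <row count>}
    loader.clear_cache()            # empty the cache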
def get_data_path(self) -> pathlib.Path

    def get_data_path(self) -> Path:
        """
        Get the data directory path.

        Returns
        -------
        Path
            Path to the data directory
        """
        return self.data_path
def load_all_obigt_files(self, use_cache: bool = True) -> Dict[str, pandas.core.frame.DataFrame]

    def load_all_obigt_files(self, use_cache: bool = True) -> Dict[str, pd.DataFrame]:
        """
        Load all OBIGT database files in the same order as R CHNOSZ.

        This mirrors the exact loading order from the R CHNOSZ/thermo.R
        OBIGT() function to ensure identical species indices between the
        R and Python versions.

        Parameters:
        -----------
        use_cache : bool, default True
            Whether to use cached data if available

        Returns:
        --------
        Dict[str, pd.DataFrame]
            Dictionary with filenames as keys and DataFrames as values,
            ordered like R CHNOSZ
        """
        obigt_files = {}

        if not self.obigt_path.exists():
            raise FileNotFoundError(f"OBIGT directory not found: {self.obigt_path}")

        # Use exact same order as R CHNOSZ (from thermo.R lines 63-67):
        #   sources_aq  <- paste0(c("H2O", "inorganic", "organic"), "_aq")
        #   sources_cr  <- paste0(c("Berman", "inorganic", "organic"), "_cr")
        #   sources_liq <- paste0(c("organic"), "_liq")
        #   sources_gas <- paste0(c("inorganic", "organic"), "_gas")
        #   sources <- c(sources_aq, sources_cr, sources_gas, sources_liq)
        r_chnosz_order = [
            "H2O_aq.csv", "inorganic_aq.csv", "organic_aq.csv",
            "Berman_cr.csv", "inorganic_cr.csv", "organic_cr.csv",
            "inorganic_gas.csv", "organic_gas.csv",
            "organic_liq.csv"
        ]

        # Load files in R CHNOSZ order
        for filename in r_chnosz_order:
            file_path = self.obigt_path / filename
            if file_path.exists():
                obigt_files[filename] = self.load_obigt_file(filename, use_cache=use_cache)
            else:
                warnings.warn(f"OBIGT file not found: {filename}")

        return obigt_files
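Because Python dicts preserve insertion order, the returned mapping can be concatenated directly to reproduce R's row order; a minimal sketch mirroring what OBIGTDatabase.load_all_data does internally:

    import pandas as pd
    from pychnosz.data import get_default_loader

    loader = get_default_loader()
    files = loader.load_all_obigt_files()
    print(list(files))  # filenames in R CHNOSZ load order
    combined = pd.concat(files.values(), ignore_index=True)
    combined.index = combined.index + 1  # 1-based rows, matching R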
def load_buffer(self, use_cache: bool = True) -> pandas.core.frame.DataFrame

    def load_buffer(self, use_cache: bool = True) -> pd.DataFrame:
        """
        Load the buffer data file.

        Parameters:
        -----------
        use_cache : bool, default True
            Whether to use cached data if available

        Returns:
        --------
        pd.DataFrame
            Buffer data with columns: name, species, state, logact
        """
        return self.load_thermo_file('buffer.csv', use_cache=use_cache)
def load_buffers(self, use_cache: bool = True) -> pandas.core.frame.DataFrame

    def load_buffers(self, use_cache: bool = True) -> pd.DataFrame:
        """
        Load buffer data (alias for load_buffer, kept for compatibility).

        Parameters
        ----------
        use_cache : bool, default True
            Whether to use cached data if available

        Returns
        -------
        pd.DataFrame
            Buffer data
        """
        try:
            return self.load_buffer(use_cache=use_cache)
        except Exception:
            # Return an empty DataFrame if buffer data is not available
            return pd.DataFrame(columns=['name', 'species', 'state', 'logact'])
def load_elements(self, use_cache: bool = True) -> pandas.core.frame.DataFrame

    def load_elements(self, use_cache: bool = True) -> pd.DataFrame:
        """
        Load the elements data file.

        Parameters:
        -----------
        use_cache : bool, default True
            Whether to use cached data if available

        Returns:
        --------
        pd.DataFrame
            Elements data with columns: element, state, source, mass, s, n
        """
        return self.load_thermo_file('element.csv', use_cache=use_cache)
def load_obigt_file(self, filename: str, use_cache: bool = True) -> pandas.core.frame.DataFrame

    def load_obigt_file(self, filename: str, use_cache: bool = True) -> pd.DataFrame:
        """
        Load a specific OBIGT database file.

        Parameters:
        -----------
        filename : str
            Name of the OBIGT file to load (e.g., 'inorganic_aq.csv')
        use_cache : bool, default True
            Whether to use cached data if available

        Returns:
        --------
        pd.DataFrame
            Loaded OBIGT data
        """
        cache_key = f"obigt_{filename}"
        if use_cache and cache_key in self._cache:
            return self._cache[cache_key].copy()

        filepath = self.obigt_path / filename
        if not filepath.exists():
            raise FileNotFoundError(f"OBIGT file not found: {filepath}")

        # Load the data
        df = self._read_csv_safe(filepath)

        # Clean up column names (remove any whitespace)
        df.columns = df.columns.str.strip()

        # Cache the result
        if use_cache:
            self._cache[cache_key] = df.copy()

        return df
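For example, loading a single source file (column names and row counts depend on the installed database version):

    from pychnosz.data import get_default_loader

    loader = get_default_loader()
    df = loader.load_obigt_file("inorganic_aq.csv")
    print(len(df), df.columns[:4].tolist())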
def load_protein(self, use_cache: bool = True) -> pandas.core.frame.DataFrame

    def load_protein(self, use_cache: bool = True) -> pd.DataFrame:
        """
        Load the protein data file.

        Parameters:
        -----------
        use_cache : bool, default True
            Whether to use cached data if available

        Returns:
        --------
        pd.DataFrame
            Protein data with amino acid compositions
        """
        return self.load_thermo_file('protein.csv', use_cache=use_cache)
def load_proteins(self, use_cache: bool = True) -> pandas.core.frame.DataFrame

    def load_proteins(self, use_cache: bool = True) -> pd.DataFrame:
        """
        Load protein data (alias for load_protein, kept for compatibility).

        Parameters
        ----------
        use_cache : bool, default True
            Whether to use cached data if available

        Returns
        -------
        pd.DataFrame
            Protein data
        """
        try:
            return self.load_protein(use_cache=use_cache)
        except Exception:
            # Return an empty DataFrame if protein data is not available
            return pd.DataFrame(columns=['protein', 'organism', 'ref', 'abbrv', 'chains'])
def load_refs(self, use_cache: bool = True) -> pandas.core.frame.DataFrame

    def load_refs(self, use_cache: bool = True) -> pd.DataFrame:
        """
        Load the references data file.

        Parameters
        ----------
        use_cache : bool, default True
            Whether to use cached data if available

        Returns
        -------
        pd.DataFrame
            References data
        """
        try:
            return self.load_thermo_file('refs.csv', use_cache=use_cache)
        except Exception:
            # Return an empty DataFrame if refs data is not available
            return pd.DataFrame(columns=['key', 'author', 'year', 'citation'])
def load_stoich(self, use_cache: bool = True) -> pandas.core.frame.DataFrame

    def load_stoich(self, use_cache: bool = True) -> pd.DataFrame:
        """
        Load the stoichiometry data file (compressed).

        Parameters:
        -----------
        use_cache : bool, default True
            Whether to use cached data if available

        Returns:
        --------
        pd.DataFrame
            Stoichiometry matrix for all species
        """
        return self.load_thermo_file('stoich.csv.xz', use_cache=use_cache)
def load_thermo_file(self, filename: str, use_cache: bool = True) -> pandas.core.frame.DataFrame

    def load_thermo_file(self, filename: str, use_cache: bool = True) -> pd.DataFrame:
        """
        Load a specific thermo database file.

        Parameters:
        -----------
        filename : str
            Name of the thermo file to load (e.g., 'element.csv', 'stoich.csv.xz')
        use_cache : bool, default True
            Whether to use cached data if available

        Returns:
        --------
        pd.DataFrame
            Loaded thermo data
        """
        cache_key = f"thermo_{filename}"
        if use_cache and cache_key in self._cache:
            return self._cache[cache_key].copy()

        filepath = self.thermo_path / filename
        if not filepath.exists():
            raise FileNotFoundError(f"Thermo file not found: {filepath}")

        # Handle compressed files
        if filepath.suffix == '.xz':
            df = self._read_compressed_csv(filepath)
        else:
            df = self._read_csv_safe(filepath)

        # Clean up column names
        df.columns = df.columns.str.strip()

        # Cache the result
        if use_cache:
            self._cache[cache_key] = df.copy()

        return df
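The .xz branch above delegates to the private _read_compressed_csv helper, which is a thin wrapper over the standard-library lzma module; a standalone sketch of the same idea (the file path here is illustrative):

    import lzma
    import pandas as pd

    # Equivalent to what DataLoader does for stoich.csv.xz
    with lzma.open("extdata/thermo/stoich.csv.xz", "rt", encoding="utf-8") as f:
        stoich = pd.read_csv(f)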
class OBIGTDatabase (data_loader: DataLoader | None = None)

High-level interface to the OBIGT thermodynamic database.

This class provides methods to access, search, and manipulate the thermodynamic data from the OBIGT database files.

Parameters:
data_loader : DataLoader, optional
    DataLoader instance to use. If None, creates a default loader.

Constructor and private helpers (the public methods are documented individually under Methods below):

    class OBIGTDatabase:

        def __init__(self, data_loader: Optional[DataLoader] = None):
            if data_loader is None:
                from .loader import get_default_loader
                self.loader = get_default_loader()
            else:
                self.loader = data_loader

            # Cache for combined data
            self._combined_data = None
            self._species_index = None

            # Expected columns for OBIGT data
            self.obigt_columns = [
                'name', 'abbrv', 'formula', 'state', 'ref1', 'ref2',
                'date', 'model', 'E_units', 'G', 'H', 'S', 'Cp', 'V',
                'a1.a', 'a2.b', 'a3.c', 'a4.d', 'c1.e', 'c2.f',
                'omega.lambda', 'z.T'
            ]

            # State classifications
            self.aqueous_states = ['aq']
            self.crystalline_states = ['cr']
            self.gas_states = ['gas']
            self.liquid_states = ['liq']

        def _create_fallback_data(self) -> pd.DataFrame:
            """Create minimal fallback data for essential species
            (approximate values, for basic functionality only)."""
            fallback_data = {
                'name':    ['water', 'H+', 'OH-', 'CO2', 'HCO3-', 'CO3-2'],
                'abbrv':   ['H2O', 'H+', 'OH-', 'CO2', 'HCO3-', 'CO3-2'],
                'formula': ['H2O', 'H+', 'OH-', 'CO2', 'HCO3-', 'CO3-2'],
                'state':   ['liq', 'aq', 'aq', 'aq', 'aq', 'aq'],
                'G':  [-56688.1, 0.0, -37595.0, -92307.0, -140314.0, -126172.0],
                'H':  [-68317.0, 0.0, -54977.0, -98900.0, -165180.0, -161963.0],
                'S':  [16.712, 0.0, -2.56, -39.75, 98.4, -50.0],
                'Cp': [18.0, 0.0, -36.4, 37.11, 25.0, -53.1],
                'V':  [18.068, 0.0, -4.71, 34.0, 25.0, -6.0],
                'z.T': [0, 1, -1, 0, -1, -2],
            }
            # Remaining text and equation-of-state columns are blank/zero
            for col in ['ref1', 'ref2', 'date', 'model', 'E_units']:
                fallback_data[col] = [''] * 6
            for col in ['a1.a', 'a2.b', 'a3.c', 'a4.d', 'c1.e', 'c2.f', 'omega.lambda']:
                fallback_data[col] = [0.0] * 6

            df = pd.DataFrame(fallback_data)
            # Cache the fallback data and index it
            self._combined_data = df
            self._create_species_index()
            return df.copy()

        def _create_species_index(self):
            """Create an index for fast species lookups."""
            if self._combined_data is None:
                return

            # Map names, formulas, and name(state) keys to row labels
            self._species_index = {}
            for idx, row in self._combined_data.iterrows():
                name = str(row.get('name', '')).strip()
                formula = str(row.get('formula', '')).strip()
                state = str(row.get('state', '')).strip()

                if name:
                    self._species_index.setdefault(name, []).append(idx)
                if formula:
                    self._species_index.setdefault(f"formula:{formula}", []).append(idx)
                if name and state:
                    self._species_index.setdefault(f"{name}({state})", []).append(idx)
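Either constructor path yields the same interface; a short sketch:

    from pychnosz.data import DataLoader, OBIGTDatabase

    db = OBIGTDatabase()                          # builds a default DataLoader internally
    db = OBIGTDatabase(data_loader=DataLoader())  # or inject an existing loader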
Methods
def export_to_csv(self, filename: str, species_filter: str | None = None)

    def export_to_csv(self, filename: str, species_filter: Optional[str] = None):
        """
        Export the database or filtered data to CSV.

        Parameters:
        -----------
        filename : str
            Output filename
        species_filter : str, optional
            Filter to apply (state name like 'aq', 'cr', etc.)
        """
        if self._combined_data is None:
            self.load_all_data()

        data_to_export = self._combined_data
        if species_filter and species_filter in ['aq', 'cr', 'gas', 'liq']:
            data_to_export = self.get_species_by_state(species_filter)

        data_to_export.to_csv(filename, index=False)
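Usage sketch (the output filenames are arbitrary):

    from pychnosz.data import get_default_obigt

    db = get_default_obigt()
    db.export_to_csv("aqueous_species.csv", species_filter="aq")  # aqueous subset
    db.export_to_csv("obigt_all.csv")                             # entire database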
def get_aqueous_species(self) -> pandas.core.frame.DataFrame

    def get_aqueous_species(self) -> pd.DataFrame:
        """Get all aqueous species."""
        return self.get_species_by_state('aq')
def get_combined_data(self) -> pandas.core.frame.DataFrame

    def get_combined_data(self) -> pd.DataFrame:
        """
        Get combined OBIGT thermodynamic data.

        Returns
        -------
        pd.DataFrame
            Combined OBIGT data with all species
        """
        if self._combined_data is not None:
            return self._combined_data.copy()
        try:
            # Try to load data normally first
            return self.load_all_data()
        except Exception as e:
            print(f"Warning: Could not load OBIGT data: {e}")
            # Fall back to minimal data for essential species
            return self._create_fallback_data()
def get_crystalline_species(self) -> pandas.core.frame.DataFrame

    def get_crystalline_species(self) -> pd.DataFrame:
        """Get all crystalline species."""
        return self.get_species_by_state('cr')
def get_database_stats(self) -> Dict[str, int | Dict[str, int]]

    def get_database_stats(self) -> Dict[str, Union[int, Dict[str, int]]]:
        """
        Get statistics about the database.

        Returns:
        --------
        Dict
            Database statistics including total species, states, etc.
        """
        if self._combined_data is None:
            self.load_all_data()

        stats = {
            'total_species': len(self._combined_data),
            'states': self._combined_data['state'].value_counts().to_dict(),
            'source_files': self._combined_data['source_file'].value_counts().to_dict(),
            'unique_names': self._combined_data['name'].nunique(),
            'unique_formulas': self._combined_data['formula'].nunique(),
        }
        return stats
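For instance (the printed numbers depend on the installed database version):

    from pychnosz.data import get_default_obigt

    db = get_default_obigt()
    stats = db.get_database_stats()
    print(stats['total_species'])
    print(stats['states'])  # counts keyed by 'aq', 'cr', 'gas', 'liq'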
def get_gas_species(self) -> pandas.core.frame.DataFrame

    def get_gas_species(self) -> pd.DataFrame:
        """Get all gas species."""
        return self.get_species_by_state('gas')
def get_liquid_species(self) -> pandas.core.frame.DataFrame

    def get_liquid_species(self) -> pd.DataFrame:
        """Get all liquid species."""
        return self.get_species_by_state('liq')
def get_species(self, identifier: str, state: str | None = None) -> pandas.core.frame.DataFrame

    def get_species(self, identifier: str, state: Optional[str] = None) -> pd.DataFrame:
        """
        Get species data by name, formula, or identifier.

        Parameters:
        -----------
        identifier : str
            Species name, formula, or identifier
        state : str, optional
            Physical state ('aq', 'cr', 'gas', 'liq')

        Returns:
        --------
        pd.DataFrame
            Matching species data
        """
        if self._combined_data is None:
            self.load_all_data()

        results = []

        # Try exact name match first
        if identifier in self._species_index:
            for idx in self._species_index[identifier]:
                # The species index stores the (1-based) row labels,
                # so look rows up with .loc rather than positional .iloc
                row = self._combined_data.loc[idx]
                if state is None or str(row.get('state', '')).strip() == state:
                    results.append(row)

        # Try formula match
        formula_key = f"formula:{identifier}"
        if formula_key in self._species_index:
            for idx in self._species_index[formula_key]:
                row = self._combined_data.loc[idx]
                if state is None or str(row.get('state', '')).strip() == state:
                    results.append(row)

        # Try name+state combination
        if state:
            name_state_key = f"{identifier}({state})"
            if name_state_key in self._species_index:
                for idx in self._species_index[name_state_key]:
                    results.append(self._combined_data.loc[idx])

        # If no exact matches, try partial matching
        if not results:
            mask = self._combined_data['name'].str.contains(identifier, case=False, na=False) | \
                   self._combined_data['formula'].str.contains(identifier, case=False, na=False)
            if state:
                mask &= (self._combined_data['state'] == state)
            partial_matches = self._combined_data[mask]
            results = [row for _, row in partial_matches.iterrows()]

        if results:
            return pd.DataFrame(results).reset_index(drop=True)
        return pd.DataFrame(columns=self._combined_data.columns)
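A lookup sketch; 'water' and 'CH4' are species the default OBIGT files are expected to contain:

    from pychnosz.data import get_default_obigt

    db = get_default_obigt()
    water = db.get_species("water", state="liq")    # exact name + state match
    methane = db.get_species("CH4")                 # falls through to formula matching
    props = db.get_thermodynamic_properties(water)  # numeric G, H, S, Cp, V columns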
def get_species_by_elements(self, elements: List[str]) -> pandas.core.frame.DataFrame

    def get_species_by_elements(self, elements: List[str]) -> pd.DataFrame:
        """
        Get species containing specific elements.

        Parameters:
        -----------
        elements : List[str]
            List of element symbols

        Returns:
        --------
        pd.DataFrame
            Species containing the specified elements
        """
        if self._combined_data is None:
            self.load_all_data()

        # Create search pattern for elements
        pattern = '|'.join(elements)
        mask = self._combined_data['formula'].str.contains(pattern, case=False, na=False)

        return self._combined_data[mask].reset_index(drop=True)
def get_species_by_state(self, state: str) -> pandas.core.frame.DataFrame

    def get_species_by_state(self, state: str) -> pd.DataFrame:
        """
        Get all species in a specific physical state.

        Parameters:
        -----------
        state : str
            Physical state ('aq', 'cr', 'gas', 'liq')

        Returns:
        --------
        pd.DataFrame
            Species data for the specified state
        """
        if self._combined_data is None:
            self.load_all_data()

        mask = self._combined_data['state'] == state
        return self._combined_data[mask].reset_index(drop=True)
def get_thermodynamic_properties(self, species_data: pandas.core.frame.DataFrame) -> pandas.core.frame.DataFrame

    def get_thermodynamic_properties(self, species_data: pd.DataFrame) -> pd.DataFrame:
        """
        Extract thermodynamic properties from species data.

        Parameters:
        -----------
        species_data : pd.DataFrame
            Species data from get_species or similar methods

        Returns:
        --------
        pd.DataFrame
            Thermodynamic properties (G, H, S, Cp, V, etc.)
        """
        thermo_columns = ['G', 'H', 'S', 'Cp', 'V', 'a1.a', 'a2.b', 'a3.c', 'a4.d',
                          'c1.e', 'c2.f', 'omega.lambda', 'z.T']

        available_columns = [col for col in thermo_columns if col in species_data.columns]
        result = species_data[['name', 'formula', 'state'] + available_columns].copy()

        # Convert numeric columns to proper numeric types
        for col in available_columns:
            result[col] = pd.to_numeric(result[col], errors='coerce')

        return result
def load_all_data(self, force_reload: bool = False) -> pandas.core.frame.DataFrame

    def load_all_data(self, force_reload: bool = False) -> pd.DataFrame:
        """
        Load and combine all OBIGT data files.

        Parameters:
        -----------
        force_reload : bool, default False
            Force reloading of data even if cached

        Returns:
        --------
        pd.DataFrame
            Combined OBIGT database
        """
        if self._combined_data is not None and not force_reload:
            return self._combined_data.copy()

        # Load all OBIGT files
        obigt_files = self.loader.load_all_obigt_files()

        # Combine all files, tagging each row with its source file
        combined_data = []
        for filename, df in obigt_files.items():
            df_copy = df.copy()
            df_copy['source_file'] = filename
            combined_data.append(df_copy)

        # Concatenate all data
        self._combined_data = pd.concat(combined_data, ignore_index=True)

        # IMPORTANT: R uses 1-based indexing, so shift the DataFrame index
        # to match R's row numbers: row 0 in pandas becomes row 1 in R.
        self._combined_data.index = self._combined_data.index + 1

        # Create species index for fast lookups
        self._create_species_index()

        return self._combined_data.copy()
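The index shift is what keeps species indices interchangeable with R CHNOSZ; a quick check:

    from pychnosz.data import get_default_obigt

    db = get_default_obigt()
    data = db.load_all_data()
    print(data.index.min())     # 1, not 0 -- row n here is row n in R
    print(data.loc[1, 'name'])  # typically 'water', the first entry of H2O_aq.csv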
def search_species(self, query: str, search_columns: List[str] | None = None) -> pandas.core.frame.DataFrame

    def search_species(self, query: str, search_columns: Optional[List[str]] = None) -> pd.DataFrame:
        """
        Search for species using a text query.

        Parameters:
        -----------
        query : str
            Search query
        search_columns : List[str], optional
            Columns to search in. Default: ['name', 'formula', 'abbrv']

        Returns:
        --------
        pd.DataFrame
            Matching species data
        """
        if self._combined_data is None:
            self.load_all_data()

        if search_columns is None:
            search_columns = ['name', 'formula', 'abbrv']

        # Build the search mask on the same (1-based) index as the data,
        # so the OR below aligns row-for-row
        mask = pd.Series(False, index=self._combined_data.index)
        for col in search_columns:
            if col in self._combined_data.columns:
                mask |= self._combined_data[col].str.contains(query, case=False, na=False)

        return self._combined_data[mask].reset_index(drop=True)
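A search sketch (case-insensitive substring matching over the chosen columns):

    from pychnosz.data import get_default_obigt

    db = get_default_obigt()
    hits = db.search_species("acetate")                          # name/formula/abbrv
    hits = db.search_species("CH3", search_columns=["formula"])  # restrict to formulas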
def validate_data(self) -> Dict[str, List]

    def validate_data(self) -> Dict[str, List]:
        """
        Validate the OBIGT database for common issues.

        Returns:
        --------
        Dict
            Validation results with issues found
        """
        if self._combined_data is None:
            self.load_all_data()

        issues = {
            'missing_names': [],
            'missing_formulas': [],
            'missing_states': [],
            'invalid_numeric_values': [],
            'duplicate_entries': []
        }

        # Check for missing critical fields
        missing_names = self._combined_data['name'].isna() | (self._combined_data['name'] == '')
        if missing_names.any():
            issues['missing_names'] = self._combined_data[missing_names].index.tolist()

        missing_formulas = self._combined_data['formula'].isna() | (self._combined_data['formula'] == '')
        if missing_formulas.any():
            issues['missing_formulas'] = self._combined_data[missing_formulas].index.tolist()

        missing_states = self._combined_data['state'].isna() | (self._combined_data['state'] == '')
        if missing_states.any():
            issues['missing_states'] = self._combined_data[missing_states].index.tolist()

        # Check for invalid numeric values in key thermodynamic properties
        numeric_columns = ['G', 'H', 'S', 'Cp']
        for col in numeric_columns:
            if col in self._combined_data.columns:
                numeric_data = pd.to_numeric(self._combined_data[col], errors='coerce')
                invalid_mask = numeric_data.isna() & self._combined_data[col].notna()
                if invalid_mask.any():
                    issues['invalid_numeric_values'].extend(
                        [(idx, col) for idx in self._combined_data[invalid_mask].index]
                    )

        # Check for potential duplicates
        duplicate_mask = self._combined_data.duplicated(subset=['name', 'formula', 'state'], keep=False)
        if duplicate_mask.any():
            issues['duplicate_entries'] = self._combined_data[duplicate_mask].index.tolist()

        return issues
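A sketch of acting on the validation report (row labels use the database's 1-based index):

    from pychnosz.data import get_default_obigt

    db = get_default_obigt()
    issues = db.validate_data()
    for kind, rows in issues.items():
        if rows:
            print(f"{kind}: {len(rows)} entries")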