Source code for robocrys.condense.mineral

"""This module provides tools for matching structures to known mineral class."""

from __future__ import annotations

from importlib.resources import files as import_resource_file
from itertools import islice
from pathlib import Path
from typing import TYPE_CHECKING, Any

import numpy as np
from matminer.utils.io import load_dataframe_from_json
from pymatgen.analysis.prototypes import AflowPrototypeMatcher
from pymatgen.core.structure import IStructure
from robocrys.condense.fingerprint import (
    get_fingerprint_distance,
    get_structure_fingerprint,
)


if TYPE_CHECKING:
    import pandas as pd

# Location of the ``mineral_db.json.gz`` resource inside the ``robocrys.condense``
# package. ``MineralMatcher`` falls back to it when no ``mineral_db`` is supplied.
_mineral_db_file = import_resource_file("robocrys.condense") / "mineral_db.json.gz"


class MineralMatcher:
    """Class to match a structure to a mineral name.

    Uses a precomputed database of minerals and their fingerprints, extracted
    from the AFLOW prototype database. For more information on this database
    see reference [aflow]_:

    .. [aflow] Mehl, M. J., Hicks, D., Toher, C., Levy, O., Hanson, R. M.,
           Hart, G., & Curtarolo, S. (2017), The AFLOW library of
           crystallographic prototypes: part 1. Computational Materials
           Science, 136, S1-S828. doi: 10.1016/j.commatsci.2017.01.017

    Args:
        initial_ltol: The fractional length tolerance used in the AFLOW
            structure matching.
        initial_stol: The site coordinate tolerance used in the AFLOW
            structure matching.
        initial_angle_tol: The angle tolerance used in the AFLOW structure
            matching.
        use_fingerprint_matching: Whether to use the fingerprint distance to
            match minerals.
        fingerprint_distance_cutoff: Cutoff to determine how similar a match
            must be to be returned. The distance is measured between the
            structural fingerprints in euclidean space.
        mineral_db: Optional path or pandas ``DataFrame`` object containing
            the mineral fingerprint database. If ``None``, the database
            bundled with the package is loaded.
    """

    def __init__(
        self,
        initial_ltol: float = 0.2,
        initial_stol: float = 0.3,
        initial_angle_tol: float = 5.0,
        use_fingerprint_matching: bool = True,
        fingerprint_distance_cutoff: float = 0.4,
        mineral_db: str | Path | pd.DataFrame | None = None,
    ) -> None:
        self.mineral_db = mineral_db if mineral_db is not None else _mineral_db_file

        # Anything path-like is loaded from disk; a DataFrame is used as given.
        if isinstance(self.mineral_db, (str, Path)):
            self.mineral_db = load_dataframe_from_json(self.mineral_db)

        self.initial_ltol = initial_ltol
        self.initial_stol = initial_stol
        self.initial_angle_tol = initial_angle_tol
        self.fingerprint_distance_cutoff = fingerprint_distance_cutoff
        self.use_fingerprint_matching = use_fingerprint_matching

        # Cache filled by ``_set_distance_matrix``: the last structure seen and
        # a copy of the database sorted by fingerprint distance to it.
        self._structure = None
        self._mineral_db = None

    def get_best_mineral_name(self, structure: IStructure) -> dict[str, Any]:
        """Gets the "best" mineral name for a structure.

        Uses a combination of AFLOW prototype matching and fingerprinting to
        get the best mineral name. The AFLOW structure prototypes are detailed
        in reference [aflow]_.

        The algorithm works as follows:

        1. Check for AFLOW match. If single match return mineral name.
        2. If multiple matches, return the one with the smallest fingerprint
           distance.
        3. If no AFLOW match, get fingerprints within tolerance. If there are
           any matches, take the one with the smallest distance.
        4. If no fingerprints within tolerance, get fingerprints without
           constraining the number of species types. If any matches, take the
           best one.

        Steps 3 and 4 are only attempted when ``use_fingerprint_matching`` is
        enabled.

        Args:
            structure (Structure): A pymatgen ``Structure`` object to match.

        Return:
            (dict): The mineral name information. Stored as a dict with the
            keys "type", "distance", "n_species_type_match", corresponding to
            the mineral name, the fingerprint distance between the prototype
            and known mineral, and whether the number of species types in the
            structure matches the number in the known prototype, respectively.
            If no mineral match is determined, the mineral type will be
            ``None``. If an AFLOW match is found, the distance will be set to
            -1.
        """
        self._set_distance_matrix(structure)  # pre-calculate distance matrix

        mineral = None
        distance = -1
        n_species_types_match = True

        aflow_matches = self.get_aflow_matches(structure)
        if aflow_matches:
            # mineral db sorted by fingerprint distance so first result always
            # has a smaller distance
            mineral = aflow_matches[0]["type"]
        elif self.use_fingerprint_matching:
            # The fingerprint searches are only run when they can actually
            # influence the result.
            fingerprint_matches = self.get_fingerprint_matches(structure)
            if fingerprint_matches:
                best = fingerprint_matches[0]
            else:
                fingerprint_derived = self.get_fingerprint_matches(
                    structure, match_n_sp=False
                )
                best = fingerprint_derived[0] if fingerprint_derived else None
                # Only flag a species-count mismatch if the relaxed search
                # produced the match that is being reported.
                n_species_types_match = best is None

            if best is not None:
                mineral = best["type"]
                distance = best["distance"]

        return {
            "type": mineral,
            "distance": distance,
            "n_species_type_match": n_species_types_match,
        }

    def get_aflow_matches(
        self,
        structure: IStructure,
    ) -> list[dict[str, Any]] | None:
        """Gets minerals for a structure by matching to AFLOW prototypes.

        Overrides
        :class:`pymatgen.analysis.prototypes.AflowPrototypeMatcher` to only
        return matches to prototypes with known mineral names.

        The AFLOW tolerance parameters (defined in the init method) are passed
        to a :class:`pymatgen.analysis.structure_matcher.StructureMatcher`
        object. The tolerances are gradually decreased until only a single
        match is found (if possible).

        The AFLOW structure prototypes are detailed in reference [aflow]_.

        Args:
            structure: A pymatgen structure to match.

        Returns:
            A :obj:`list` of :obj:`dict`, sorted by how close the match is,
            with the keys 'type', 'distance', 'structure'. Distance is the
            euclidean distance between the structure and prototype
            fingerprints. If no match was found within the tolerances,
            ``None`` will be returned.
        """
        self._set_distance_matrix(structure)

        # redefine AflowPrototypeMatcher._match_prototype function to run over
        # our custom pandas DataFrame of AFLOW prototypes. This DataFrame only
        # contains entries from the AFLOW database with mineral names. We
        # have also pre-calculated the fingerprints and distances to make this
        # quicker.
        def _match_prototype(structure_matcher, s):
            return [
                _get_row_data(row)
                for _, row in self._mineral_db.iterrows()
                if structure_matcher.fit_anonymous(row["structure"], s)
            ]

        matcher = AflowPrototypeMatcher(
            initial_ltol=self.initial_ltol,
            initial_stol=self.initial_stol,
            initial_angle_tol=self.initial_angle_tol,
        )
        # Set on the instance, so it is called without an implicit ``self``.
        matcher._match_prototype = _match_prototype

        return matcher.get_prototypes(structure)

    def get_fingerprint_matches(
        self,
        structure: IStructure,
        max_n_matches: int | None = None,
        match_n_sp: bool = True,
        mineral_name_constraint: str | None = None,
    ) -> list[dict[str, Any]] | None:
        """Gets minerals for a structure by matching to AFLOW fingerprints.

        Only AFLOW prototypes with mineral names are considered. The AFLOW
        structure prototypes are detailed in reference [aflow]_.

        Args:
            structure: A structure to match.
            max_n_matches: Maximum number of matches to return. Set to ``None``
                to return all matches within the cutoff.
            match_n_sp: Whether the structure and mineral must have the same
                number of species. Defaults to True.
            mineral_name_constraint: If given, limit the matching to this
                mineral name. The comparison is case-insensitive.

        Returns:
            A :obj:`list` of :obj:`dict`, sorted by how close the match is,
            with the keys 'type', 'distance', 'structure'. Distance is the
            euclidean distance between the structure and prototype
            fingerprints. If no match was found within the tolerances,
            ``None`` will be returned.
        """
        self._set_distance_matrix(structure)

        mineral_db = self._mineral_db

        if mineral_name_constraint:
            # Both sides are lower-cased; lower-casing only the database names
            # would make any constraint with an upper-case letter unmatchable.
            mineral_db = mineral_db[
                mineral_db["mineral"].str.lower() == mineral_name_constraint.lower()
            ]

        if match_n_sp:
            mineral_db = mineral_db[mineral_db["n_elems"] == structure.n_elems]

        num_rows = mineral_db.shape[0]
        max_n_matches = max_n_matches or num_rows
        max_n_matches = min(max_n_matches, num_rows)

        # The database is sorted by ascending distance and boolean filtering
        # keeps that order, so rows within the cutoff form a prefix. Taking the
        # first ``max_n_matches`` rows before applying the cutoff is therefore
        # the same as taking the first ``max_n_matches`` rows within it.
        minerals = [
            _get_row_data(row)
            for _, row in islice(mineral_db.iterrows(), max_n_matches)
            if row["distance"] < self.fingerprint_distance_cutoff
        ]

        return minerals if minerals else None

    def _set_distance_matrix(self, structure: IStructure) -> None:
        """Utility func to calculate distance between structure and minerals.

        First checks to see if the distances have already been calculated for
        the structure. If not, the distances are calculated and stored on the
        instance (``self._mineral_db``, sorted by distance) for use by other
        methods, together with the structure they were calculated for.

        Args:
            structure: A structure.
        """
        if self._structure == structure and self._mineral_db is not None:
            return

        # Work on a copy so the user-supplied database is never modified.
        data = self.mineral_db.copy()
        fingerprint = get_structure_fingerprint(structure)

        if np.linalg.norm(fingerprint) < 0.4:
            # fingerprint is too small for a reasonable match, indicates very
            # little bonding or small order parameter matches
            fingerprint = get_structure_fingerprint(structure, prototype_match=False)

        data["distance"] = data["fingerprint"].apply(
            lambda x: get_fingerprint_distance(x, fingerprint)
        )

        self._mineral_db = data.sort_values(by="distance")
        self._structure = structure
def _get_row_data(row: dict) -> dict[str, Any]: """Utility function to extract mineral data from pandas `DataFrame` row.""" return { "type": row["mineral"], "distance": row["distance"], "structure": row["structure"], }