Source code for ml.process

"""Process ab-initio molecular dynamics simulations."""

import glob
import os
import io
import time
import pandas as pd
import numpy as np
import MDAnalysis as mda
from typing import List
from biopandas.pdb import PandasPdb
from MDAnalysis.analysis import distances
from itertools import combinations

[docs]def check_file_exists(filename): """ Check if a file with the given name exists. If it does not exist, the function ends the current Python session with an informative error message. Parameters ---------- filename : str The name of the file to check for existence. """ # Check if the file exists if not os.path.isfile(filename): # Raise a FileNotFoundError with a custom error message raise FileNotFoundError( f" > File '{filename}' does not exist. Please provide a valid file name." ) else: print(f" > File '{filename}' exists.") return None
[docs]def combine_sp_xyz(): """ Combines single point xyz's for all replicates. The QM single points each of a geometry file. Combines all those xyz files into. Preferential to using the other geometry files to insure they are identical. Returns ------- replicate_info : List[tuple()] List of tuples with replicate number and frame count for the replicates. """ start_time = time.time() # Used to report the executation speed # Get the directories of each replicate primary = os.getcwd() replicates = sorted(glob.glob("*/")) ignore = ["Analyze/", "Analysis/", "coordinates/", "inputfiles/", "opt-wfn/"] xyz_count = 0 replicate_info = [] # Initialize an empty list to store replicate information # Get the name of the structure geometry_name = os.getcwd().split("/")[-1] out_file = f"{geometry_name}_geometry.xyz" with open(out_file, "w") as combined_sp: for replicate in replicates: if replicate in ignore: continue else: print(f" > Adding replicate {replicate} structures.") os.chdir(replicate) secondary = os.getcwd() os.chdir("coordinates") structures = sorted(glob.glob("*.xyz")) frame_count = 0 # Initialize frame count for each replicate for index, structure in enumerate(structures): with open(structure, "r") as file: # Write the header from the first file combined_sp.writelines(file.readlines()) xyz_count += 1 frame_count += 1 replicate_info.append( (replicate[:-1], frame_count) ) # Append replicate information # Go back and loop through all the other replicates os.chdir(primary) total_time = round(time.time() - start_time, 3) # Time to run the function print( f""" \t----------------------------ALL RUNS END---------------------------- \tOUTPUT: Combined {xyz_count} single point xyz files. \tOUTPUT: Output file is {out_file}. \tTIME: Total execution time: {total_time} seconds. \t--------------------------------------------------------------------\n """ ) return replicate_info
[docs]def combine_qm_replicates() -> None: """ Combine the all_charges.xls files for replicates into a master charge file. The combined file contains a single header with atom numbers as columns. Each row represents a new charge instance. The first column indicates which replicate the charge came from. """ start_time = time.time() # Used to report the executation speed charge_file = "all_charges.xls" ignore = ["Analysis/"] # Directory containing all replicates primary_dir = os.getcwd() directories = sorted(glob.glob("*/")) replicates = [i for i in directories if i not in ignore] replicate_count = len(replicates) # Report to user # Remove any old version because we are appending if os.path.exists(charge_file): os.remove(charge_file) print(f" > Deleting old {charge_file}.") # Create a new file to save charges with open(charge_file, "a") as new_charge_file: header_written = False for replicate in replicates: # There will always be an Analysis folder os.chdir(replicate) secondary_dir = os.getcwd() print(f" > Adding {secondary_dir}") # Get the replicate number from the folder name replicate_number = os.path.basename(os.path.normpath(secondary_dir)) # Add the header for the first replicate with open(charge_file, "r") as current_charge_file: for index, line in enumerate(current_charge_file): if index == 0: if not header_written: new_charge_file.writelines(line.strip() + "\treplicate\n") header_written = True continue elif "nan" in line: print(f" > Found nan values in {secondary_dir}.") else: new_charge_file.writelines( line.strip() + "\t" + replicate_number + "\n" ) os.chdir(primary_dir) total_time = round(time.time() - start_time, 3) # Seconds to run the function print( f""" \t----------------------------ALL RUNS END---------------------------- \tRESULT: Combined charges across {replicate_count} replicates. \tOUTPUT: Generated {charge_file} in the current directory. \tTIME: Total execution time: {total_time} seconds. \t--------------------------------------------------------------------\n """ )
[docs]def combine_replicates( all_charges: str = "all_charges.xls", all_coors: str = "all_coors.xyz" ) -> None: """ Collects charges or coordinates into a xls and xyz file across replicates. Parameters ---------- all_charges : str The name of the file containing all charges in xls format. all_coors.xyz : str The name of the file containing the coordinates in xyz format. Notes ----- Run from the directory that contains the replicates. Run combine_restarts first for if each replicated was run across multiple runs. Generalized to combine any number of replicates. See Also -------- qa.process.combine_restarts: Combines restarts and should be run first. """ # General variables start_time = time.time() # Used to report the executation speed files = [all_charges, all_coors] # Files to be concatonated charge_files: list[str] = [] # List of the charge file locations coors_files: list[str] = [] # List of the coors file locations root = os.getcwd() dirs = sorted(glob.glob(f"{root}/*/")) # glob to efficiently grab only dirs replicates = len(dirs) # Only used to report to user # Loop through all directories containing replicates for dir in dirs: if os.path.isfile(f"{dir}{files[0]}") and os.path.isfile(f"{dir}{files[1]}"): charge_files.append(f"{dir}{files[0]}") coors_files.append(f"{dir}{files[1]}") new_file_names = [f"raw_{all_charges}", all_coors] file_locations = [charge_files, coors_files] # Loop over the file names and their locations for file_name, file_location in zip(new_file_names, file_locations): # Open a new file where we will write the concatonated output with open(file_name, "wb") as outfile: for loc in file_location: with open(loc, "rb") as infile: shutil.copyfileobj(infile, outfile) # The combined charge file now has multiple header lines first_line = True with open(new_file_names[0], "r") as raw_charge_file: with open(files[0], "w") as clean_charge_file: for line in raw_charge_file: # We want the first line to have the header if first_line == True: clean_charge_file.write(line) first_line = False # After the first, no lines should contain atom names else: if "H" in line: continue else: clean_charge_file.write(line) # Delete the charge file with the extra headers to keep the dir clean os.remove(new_file_names[0]) total_time = round(time.time() - start_time, 3) # Seconds to run the function print( f""" \t----------------------------ALL RUNS END---------------------------- \tRESULT: Combined {replicates} replicates. \tOUTPUT: Generated {files[0]} and {files[1]} in the current directory. \tTIME: Total execution time: {total_time} seconds. \t--------------------------------------------------------------------\n """ )
[docs]def combine_qm_charges(first_job: int, last_job: int, step: int) -> None: """ Combines the charge_mull.xls files generate by TeraChem single points. After running periodic single points on the ab-initio MD data, we need to process the charge data so that it matches the SQM data. This code gets the charges from each single point and combines them. Results are stored in a tabular form. Parameters ---------- first_job: int The name of the first directory and first job e.g., 0 last_job: int The name of the last directory and last job e.g., 39901 step: int The step size between each single point. """ start_time = time.time() # Used to report the executation speed new_charge_file = "all_charges.xls" current_charge_file = "charge_mull.xls" ignore = ["Analysis/"] # Directory containing all replicates primary_dir = os.getcwd() directories = sorted(glob.glob("*/")) replicates = [i for i in directories if i not in ignore] replicate_count = len(replicates) # Report to user for replicate in replicates: frames = 0 # Saved to report to the user os.chdir(replicate) # The location of the current qm job that we are appending secondary_dir = os.getcwd() print(f" > Adding { secondary_dir}") # Create a new file where we will store the combined charges first_charges_file = True # We need the title line but only once if os.path.exists(new_charge_file): os.remove(new_charge_file) # Since appending remove old version print(f" > Deleting old {secondary_dir}/{new_charge_file}.") with open(new_charge_file, "a") as combined_charges_file: # A list of all job directories assuming they are named as integers job_dirs = [str(dir) for dir in range(first_job, last_job, step)] # Change into one of the QM job directories for index, dir in enumerate(job_dirs): os.chdir(dir) tertiary_dir = os.getcwd() os.chdir("scr") # Open an individual charge file from a QM single point atom_column = [] charge_column = [] # Open one of the QM charge single point files with open(current_charge_file, "r") as charges_file: # Separate the atom and charge information for line in charges_file: clean_line = line.strip().split("\t") charge_column.append(clean_line[1]) atom_column.append(clean_line[0]) # Join the data and separate it with tabs charge_line = "\t".join(charge_column) # For some reason, TeraChem indexes at 0 with SQM, # and 1 with QM so we change the index to start at 1 atoms_line_reindex = [] for atom in atom_column: atom_list = atom.split() atom_list[0] = str(int(atom_list[0]) - 1) x = " ".join(atom_list) atoms_line_reindex.append(x) atom_line = "\t".join(atoms_line_reindex) # Append the data to the combined charges data file # We only add the header line once if first_charges_file: combined_charges_file.write(f"{atom_line}\n") combined_charges_file.write(f"{charge_line}\n") frames += 1 first_charges_file = False # Skip the header if it has already been added else: if "nan" in charge_line: print(f" > Found nan values in {index * 100}!!") combined_charges_file.write(f"{charge_line}\n") frames += 1 os.chdir(secondary_dir) print(f" > Combined {frames} frames.") os.chdir(primary_dir) total_time = round(time.time() - start_time, 3) # Seconds to run the function print( f""" \t----------------------------ALL RUNS END---------------------------- \tRESULT: Combined charges across {replicate_count} replicates. \tOUTPUT: Generated {new_charge_file} in the current directory. \tTIME: Total execution time: {total_time} seconds. \t--------------------------------------------------------------------\n """ )
[docs]def xyz2pdb_traj() -> None: """ Converts an xyz trajectory file into a pdb trajectory file. Note ---- Make sure to manually check the PDB that is read in. Assumes no header lines. Assumes that the only TER flag is at the end. """ start_time = time.time() # Used to report the executation speed # Get the name of the structure pdb_name = "template.pdb" geometry_name = os.getcwd().split("/")[-1] xyz_name = f"{geometry_name}_geometry.xyz" new_pdb_name = f"{geometry_name}_geometry.pdb" # Open files for reading xyz_file = open(xyz_name, "r").readlines() pdb_file = open(pdb_name, "r").readlines() max_atom = int(pdb_file[len(pdb_file) - 3].split()[1]) new_file = open(new_pdb_name, "w") atom = -1 # Start at -1 to skip the XYZ header line_count = 0 for line in xyz_file: line_count += 1 if atom > 0: atom += 1 try: x, y, z = line.strip("\n").split()[1:5] # Coordinates from xyz file element_name = line.strip("\n").split()[0] except: print(f"> Script died at {line_count} -> '{line}'") print("> Check if you remembered to add a TER line at the end of your PDB") quit() pdb_line = pdb_file[atom - 2].strip() # PDB is two behind the xyz if len(element_name) > 1: new_file.write( f"{pdb_line[0:30]}{x[0:6]} {y[0:6]} {z[0:6]} {pdb_line[54:80]} {element_name}\n" ) else: new_file.write( f"{pdb_line[0:30]}{x[0:6]} {y[0:6]} {z[0:6]} {pdb_line[54:80]} {element_name}\n" ) else: atom += 1 if atom > max_atom: atom = -1 new_file.write("END\n") total_time = round(time.time() - start_time, 3) # Seconds to run the function print( f""" \t----------------------------ALL RUNS END---------------------------- \tRESULT: Converted {xyz_name} to {new_pdb_name}. \tOUTPUT: Generated {new_pdb_name} in the current directory. \tTIME: Total execution time: {total_time} seconds. \t--------------------------------------------------------------------\n """ )
[docs]def pairwise_distances_csv(pdb_traj_path, output_file, replicate_info, remove=[]): """ Calculate pairwise distances between residue centers of mass and save the result to a CSV file. Parameters ---------- pdb_traj_path : str The file path of the PDB trajectory file. output_file : str The name of the output CSV file. remove : list of ints A list of integars corresponding to residues indexed at zero to drop """ start_time = time.time() # Used to report the executation speed # Read the trajectory file and split it into models with open(pdb_traj_path) as f: models = f.read().split("END") # Create a list of StringIO objects for each model frame_files = [io.StringIO(model) for model in models if model.strip()] universes = [mda.Universe(frame_file, format="pdb") for frame_file in frame_files] # Generate column names based on residue pairs residue_names = [ residue.resname + str(residue.resid) for residue in universes[0].residues ] # Filter out residues that user wants to remove for idx in sorted(remove, reverse=True): del residue_names[idx] # Regenerate column names based on filtered residue pairs residue_pairs = list(combinations(residue_names, 2)) column_names = [f"{pair[0]}-{pair[1]}" for pair in residue_pairs] pairwise_distances = [] for universe in universes: # Calculate the center of mass for each residue, skipping the residues to remove residue_com = np.array( [residue.atoms.center_of_mass() for i, residue in enumerate(universe.residues) if i not in remove] ) # Calculate the pairwise distance matrix distance_matrix = distances.distance_array(residue_com, residue_com) pairwise_distances.append( distance_matrix[np.triu_indices(len(residue_com), k=1)] ) # Create a DataFrame with pairwise distances and column names pairwise_distances_df = pd.DataFrame(pairwise_distances, columns=column_names) # Processes replicate info and add it as the last column replicate_list = [] for replicate, count in replicate_info: replicate_list.extend([replicate] * count) replicate_col = pd.DataFrame(replicate_list, columns=["replicate"]) # Ensure the replicate_col has the same number of rows as the other dataframe. assert len(pairwise_distances_df) == len( replicate_col ), "Dataframes have different number of rows." # Concatenate the dataframes along the columns axis pairwise_distances_df = pd.concat([pairwise_distances_df, replicate_col], axis=1) pairwise_distances_df.to_csv(output_file, index=False) num_pairwise_distances = (pairwise_distances_df.shape[1] - 1) total_time = round(time.time() - start_time, 3) # Seconds to run the function print( f""" \t----------------------------ALL RUNS END---------------------------- \tRESULT: Created {num_pairwise_distances} pairwise distance data set from xyz's. \tOUTPUT: Save pairwise distance data to {output_file}. \tTIME: Total execution time: {total_time} seconds. \t--------------------------------------------------------------------\n """ )
[docs]def pairwise_charge_features(structure, charge_data): """ Generate pairwise charge features for a given metalloenzyme structure. This function reads charge data from a CSV file, computes pairwise charge features using the specified operation (addition or multiplication), and saves the results as a new CSV file. The input CSV file should contain columns with charges for each amino acid in the metalloenzyme, with each row representing a frame from an ab-initio molecular dynamics simulation. The last column should be named "replicate" and contain information about the replicate each row belongs to. Parameters ---------- structure : str The name of the metalloenzyme structure, used to find the input CSV file (named "{structure}_charges.csv") and to name the output CSV file (named "{structure}_charges_pairwise_{operation}.csv"). Raises ------ ValueError If the user inputs an invalid operation for pairwise charge features. Valid operations are "add" and "multiply". """ # Load the data df = pd.read_csv(charge_data) # Remove the "replicate" column and store it separately replicate = df.pop("replicate") # Prompt the user for the desired operation operation = input("Operation for pairwise charges Add (a) or Multiply (m)? ") # Generate pairwise charge features feature_columns = df.columns new_features = [] for i, col1 in enumerate(feature_columns): for j, col2 in enumerate(feature_columns): if j <= i: # Skip duplicate pairs continue # Perform the specified operation if operation == "a": new_features.append( pd.DataFrame({f"{col1}-{col2}": df[col1] + df[col2]}) ) elif operation == "m": new_features.append( pd.DataFrame({f"{col1}-{col2}": df[col1] * df[col2]}) ) else: raise ValueError("Invalid operation. Choose 'add' or 'multiply'.") # Concatenate the original DataFrame and new DataFrame charge_pairs_df = pd.concat(new_features, axis=1) # Add the "replicate" column back in as the final column charge_pairs_df = pd.concat([charge_pairs_df, replicate], axis=1) # Save the results as a new CSV operation = {"a": "add", "m": "multiply"}.get(operation, operation) output_file = f"{structure}_charges_pairwise_{operation}.csv" charge_pairs_df.to_csv(output_file, index=False) return charge_pairs_df
[docs]def get_residue_identifiers(template, by_atom=True) -> List[str]: """ Gets the residue identifiers such as Ala1 or Cys24. Returns either the residue identifiers for every atom, if by_atom = True or for just the unique amino acids if by_atom = False. Parameters ---------- template: str The name of the template pdb for the protein of interest. by_atom: bool A boolean value for whether to return the atom identifiers for all atoms Returns ------- residues_indentifier: List(str) A list of the residue identifiers """ # Get the residue number identifiers (e.g., 1) residue_number = ( PandasPdb().read_pdb(template).df["ATOM"]["residue_number"].tolist() ) # Get the residue number identifiers (e.g., ALA) residue_name = PandasPdb().read_pdb(template).df["ATOM"]["residue_name"].tolist() # Combine them together residues_indentifier = [ f"{name}{number}" for number, name in zip(residue_number, residue_name) ] # Return only unique entries if the user sets by_atom = False if not by_atom: residues_indentifier = list(OrderedDict.fromkeys(residues_indentifier)) return residues_indentifier
[docs]def summed_residue_charge(charge_data: pd.DataFrame, template: str): """ Sums the charges for all atoms by residue. Reduces inaccuracies introduced by the limitations of Mulliken charges. Parameters ---------- charge_data: pd.DataFrame A DataFrame containing the charge data. template: str The name of the template pdb for the protein of interest. Returns ------- sum_by_residues: pd.DataFrame The charge data averaged by residue and stored as a pd.DataFrame. """ # Extract the "replicate" column and remove it from the charge_data DataFrame replicate_column = charge_data["replicate"] charge_data = charge_data.drop("replicate", axis=1) # Get the residue identifiers (e.g., 1Ala) for each atom residues_indentifier = get_residue_identifiers(template) # Assign the residue identifiers as the column names of the charge DataFrame charge_data.columns = residues_indentifier sum_by_residues = charge_data.groupby( by=charge_data.columns, sort=False, axis=1 ).sum() # Add the "replicate" column back to the sum_by_residues DataFrame sum_by_residues["replicate"] = replicate_column return sum_by_residues
[docs]def final_charge_dataset( charge_file: str, template: str, mutations: List[int] ) -> pd.DataFrame: """ Create final charge data set. The output from the combined charges is an .xls file with atoms as columns. We will combine the atoms by residues and average the charges. Returns ------- charges_df: pd.DataFrame The original charge data as a pandas dataframe. """ print(f" > Converting atoms to residues for {charge_file}.") # Load the charge file as a DataFrame charge_data = pd.read_csv(charge_file, sep="\t") # Average atoms by residue to minimize the inaccuracies of Mulliken charges avg_by_residues = summed_residue_charge(charge_data, template) # Drop the residue columns that were mutated # We can't compare these residues' charges as their atom counts differ charges_df = avg_by_residues.drop( avg_by_residues.columns[[m for m in mutations]], axis=1 ) # Save the individual dataframe to a CSV file geometry_name = os.getcwd().split("/")[-1] output_file = f"{geometry_name}_charges_ml.csv" charges_df.to_csv(output_file, index=False) print(f" > Saved {output_file}.") return charges_df
[docs]def calculate_esp(component_atoms, scheme): """ Calculate the electrostatic potential (ESP) of a molecular component. Takes the output from a Multiwfn charge calculation and calculates the ESP. Run it from the folder that contains all replicates. It will generate a single csv file with all the charges for your residue, with one component/column specified in the input residue dictionary. Parameters ---------- component_atoms: List[int] A list of the atoms in a given component """ # Physical constants k = 8.987551 * (10**9) # Coulombic constant in kg*m**3/(s**4*A**2) A_to_m = 10 ** (-10) KJ_J = 10**-3 faraday = 23.06 # Kcal/(mol*V) C_e = 1.6023 * (10**-19) one_mol = 6.02 * (10**23) cal_J = 4.184 component_esp_list = [] # Open a charge scheme file as a pandas dataframe file_path = glob.glob(f"*_{scheme}.txt")[0] df_all = pd.read_csv(file_path, sep="\s+", names=["Atom", "x", "y", "z", "charge"]) # The index of the metal center assuming iron Fe metal_index = df_all.index[df_all["Atom"] == "Fe"][0] component_atoms.append(metal_index) # Select rows corresponding to an atoms in the component df = df_all[df_all.index.isin(component_atoms)] df.reset_index(drop=True, inplace=True) # Get the new index of the metal as it will have changed metal_index = df.index[df["Atom"] == "Fe"][0] # Convert columns lists for indexing atoms = df["Atom"] # Now contains only atoms in component charges = df["charge"] xs = df["x"] ys = df["y"] zs = df["z"] # Determine position and charge of the target atom xo = xs[metal_index] yo = ys[metal_index] zo = zs[metal_index] chargeo = charges[metal_index] total_esp = 0 for idx in range(0, len(atoms)): if idx == metal_index: continue else: # Calculate esp and convert to units (A to m) r = ( ((xs[idx] - xo) * A_to_m) ** 2 + ((ys[idx] - yo) * A_to_m) ** 2 + ((zs[idx] - zo) * A_to_m) ** 2 ) ** 0.5 total_esp = total_esp + (charges[idx] / r) # Note that cal/kcal * kJ/J gives 1 component_esp = k * total_esp * ((C_e)) * cal_J * faraday return component_esp
[docs]def collect_esp_components(first_job: int, last_job: int, step: int) -> None: """ Loops over replicates and single points and collects metal-centered ESPs. The main purpose is to navigagt the file structure and collect the data. The computing of the ESP is done in the calculate_esp() function. Parameters ---------- first_job: int The name of the first directory and first job e.g., 0 last_job: int The name of the last directory and last job e.g., 39900 step: int The step size between each single point. See Also -------- qa.process.calculate_esp() """ start_time = time.time() # Used to report the executation speed ignore = ["Analysis/", "coordinates/", "inputfiles/"] charge_schemes = ["ADCH", "Hirshfeld", "Mulliken", "Voronoi"] components = { "all": "1-487", "lower": "1-252", "upper": "253-424", "lower-his": "1-86,104-252", "heme": "425-486", "his": "87-103", } qm_job_count = 0 # Directory containing all replicates primary_dir = os.getcwd() directories = sorted(glob.glob("*/")) replicates = [i for i in directories if i not in ignore] replicate_count = len(replicates) # Report to user # Loop over each charge scheme which will be in the scr/ directory for scheme in charge_schemes: # Create a pandas dataframe with the columns from components keys charge_scheme_df = pd.DataFrame(columns=components.keys()) # Loop over each replicate row_index = 0 for replicate in replicates: os.chdir(replicate) # The location of the current qm job that we are appending secondary_dir = os.getcwd() # A list of all job directories assuming they are named as integers job_dirs = [str(dir) for dir in range(first_job, last_job, step)] # Change into one of the QM job directories for dir in job_dirs: os.chdir(dir) tertiary_dir = os.getcwd() os.chdir("scr/") row_index += 1 # Loop of the values of our dictionary for key, value in components.items(): component_atoms = [] # Convert number strings, with commas and dashes, to numbers for range_str in value.split(","): start, end = map(int, range_str.split("-")) component_atoms.extend(range(start - 1, end)) # Run a function try: component_esp = calculate_esp(component_atoms, scheme) except: print(f"Job: {replicate}-->{dir}") charge_scheme_df.loc[row_index, key] = component_esp # Move back to the QM job directory qm_job_count += 1 os.chdir(secondary_dir) os.chdir(primary_dir) # Save the dataframe to a csv file charge_scheme_df.to_csv(f"{scheme}_esp.csv", index=False) total_time = round(time.time() - start_time, 3) # Time to run the function print( f""" \t----------------------------ALL RUNS END---------------------------- \tRESULT: Performed operation on {replicate_count} replicates. \tOUTPUT: Output files for {qm_job_count} single points. \tTIME: Total execution time: {total_time} seconds. \t--------------------------------------------------------------------\n """ )
[docs]def add_esp_charges(charges_df, esp_scheme, geometry_name): """ Add the "upper" and "lower" columns from a CSV file to a DataFrame by removing and re-adding the "replicates" column. Parameters ---------- charges_df : pandas.DataFrame The DataFrame to which the "upper" and "lower" columns will be added. esp_scheme : str The name of the esp file. geometry_name : str The name of the mimochrome or directory where we are working. Returns ------- pandas.DataFrame The modified DataFrame with the "upper" and "lower" columns added and the "replicates" column re-added at the end. """ # Read the csv file as a DataFrame csv_df = pd.read_csv(f"{esp_scheme}_esp.csv") # Select the "upper" and "lower" columns selected_columns = csv_df[["upper", "lower"]] # Remove the "replicates" column from charges_df and store it separately replicates_column = charges_df.pop("replicate") # Concatenate charges_df and selected_columns charges_df = pd.concat([charges_df, selected_columns], axis=1) # Add the "replicates" column back to charges_df charges_df["replicate"] = replicates_column charges_df.to_csv(f"{geometry_name}_charge_esp_ml.csv", index=False) return charges_df
if __name__ == "__main__": # Execute when run as a script pairwise_distances_csv("path/to/your/pdb_trajectory_file.pdb")