"""Look up PubMed article titles for the PMIDs listed in a TSV file.

Run as a script, this reads the PMIDs from ``INPUT_TSV``, queries PubMed once
per PMID through ``Bio.Entrez``, writes a PMID/Title table to ``DETAILS_TSV``
and writes every PMID whose lookup failed to ``UNFOUND_TXT``.
"""

import logging

from Bio import Entrez
import pandas as pd

logger = logging.getLogger(__name__)

# File locations used by the script entry point.
INPUT_TSV = '107k.tsv'
DETAILS_TSV = '107k_details_cleaned.tsv'
UNFOUND_TXT = 'unfound_pmids.txt'

# Module-level accumulators filled by fetch_pubmed_info():
# one {'PMID', 'Title'} dict per processed PMID, and the PMIDs that failed.
publication_info_list = []
unfound_pmids = []

Entrez.email = 'uth_tester@tester.com'


def _extract_title(record):
    """Return the title of the first article in a parsed efetch record."""
    article = record['PubmedArticle'][0]['MedlineCitation']['Article']
    return str(article.get('ArticleTitle', 'No Title Available'))


def fetch_pubmed_info(pmid):
    """
    Fetches publication information (title) from PubMed using the PMID.

    Exactly one entry is appended to ``publication_info_list`` per call. When
    the lookup or parsing fails for any reason, the PMID is also appended to
    ``unfound_pmids``, the stored title is ``'Not Found'`` and the cause is
    logged as a warning; the exception is not propagated.

    Args:
    - pmid (str): PubMed Identifier (PMID) of the publication.

    Returns:
    - None: Appends the results to the global publication_info_list and
      unfound_pmids.
    """
    try:
        handle = Entrez.efetch(db='pubmed', id=pmid, retmode='xml')
        try:
            record = Entrez.read(handle)
        finally:
            # Release the response even when parsing raises.
            handle.close()
        title = _extract_title(record)
    except Exception as error:
        # Deliberately best-effort: one bad PMID must not abort the whole
        # run, but the reason is reported instead of being dropped silently.
        logger.warning(
            'PMID %s: lookup failed (%s: %s)',
            pmid, type(error).__name__, error,
        )
        unfound_pmids.append(pmid)
        publication_info_list.append({'PMID': pmid, 'Title': 'Not Found'})
        return

    publication_info_list.append({'PMID': pmid, 'Title': title})


def main():
    """Process every PMID in INPUT_TSV and write the two result files."""
    # The input has no header row; the three columns are named here.
    data = pd.read_csv(
        INPUT_TSV,
        sep='\t',
        encoding='utf8',
        names=['PMID', 'Gene', 'Drug'],
        header=None,
        index_col=False,
    )

    for count, pmid in enumerate(data['PMID'], start=1):
        fetch_pubmed_info(pmid)
        # '\r' keeps the progress counter on a single terminal line.
        print(f'Processed {count} PMIDs', end='\r', flush=True)

    # Explicit columns keep the header row even when nothing was processed.
    publication_df = pd.DataFrame(
        publication_info_list, columns=['PMID', 'Title']
    )
    publication_df.to_csv(DETAILS_TSV, sep='\t', index=False)

    # One unfound PMID per line.
    with open(UNFOUND_TXT, 'w', encoding='utf-8') as out:
        out.writelines(f'{pmid}\n' for pmid in unfound_pmids)


if __name__ == '__main__':
    main()