"""Find gene/drug co-mentions in article abstracts.

Reads ``107k_details_1.tsv``, keeps every (row, drug) pair whose abstract
mentions both the row's gene and that drug, fills missing title/DOI/date
fields from the CrossRef API, and writes the matches to
``107k_mentions_in_abstract.tsv``.
"""

import re

import pandas as pd
import requests

CROSSREF_WORKS_URL = "https://api.crossref.org/works"
NO_DOI = 'No DOI available'

# Output columns, in the order they are written to the TSV file.
RECORD_COLUMNS = ['PMID', 'Gene', 'Drug', 'Title', 'Abstract', 'DOI', 'Date']

# Record fields that may be filled from CrossRef when the input row lacks them.
FETCHABLE_FIELDS = ('Title', 'DOI', 'Date')

# Initialize a list to store matched records
matched_records = []

# CrossRef results keyed by title, so each title is queried at most once per run.
_article_info_cache = {}


def _contains_term(text, term):
    """Return True if ``term`` occurs in ``text`` as a whole token, ignoring case."""
    if not term:
        # An empty pattern matches anywhere and would turn every abstract into a hit.
        return False
    # Lookarounds instead of \b: \b only matches between a word character and a
    # non-word character, so a term that starts or ends with punctuation
    # (e.g. "CD4+") could never match when followed by a space.
    pattern = r'(?<!\w)' + re.escape(term) + r'(?!\w)'
    return re.search(pattern, text, flags=re.IGNORECASE) is not None


def extract_mentions(text, gene, drug):
    """
    Check if both gene and drug are mentioned in the text.

    Matching is case-insensitive and whole-token: a keyword is not matched
    inside a longer alphanumeric word.

    Args:
        text (str): The abstract to search within.
        gene (str): The gene keyword to search for.
        drug (str): The drug keyword to search for.

    Returns:
        str: 'yes' if both keywords are found, otherwise None. None is also
        returned when any argument is not a string or a keyword is empty.
    """
    if not all(isinstance(value, str) for value in (text, gene, drug)):
        return None
    if _contains_term(text, gene) and _contains_term(text, drug):
        return "yes"
    return None


def _format_date_parts(parts):
    """Format a CrossRef ``[year, month, day]`` list as 'YYYY', 'YYYY-MM' or 'YYYY-MM-DD'.

    Formatting stops at the first missing (None) component; an empty or
    all-missing list gives ''.
    """
    known = []
    for value in parts[:3]:
        if value is None:
            break
        known.append(value)
    if not known:
        return ''
    year, *rest = known
    return '-'.join([str(year)] + [f"{int(value):02d}" for value in rest])


def _empty_article_info(title):
    """Return the placeholder result used when CrossRef yields nothing usable."""
    return {
        'Title': title if isinstance(title, str) else '',
        'Abstract': '',
        'DOI': NO_DOI,
        'Date': '',
    }


def get_article_info(title, timeout=10):
    """
    Retrieve article information from CrossRef API if fields are missing.

    The first hit of a title query is used as-is.
    NOTE(review): nothing checks that the hit is really the same article.

    Args:
        title (str): The title of the article.
        timeout (float): Seconds to wait for CrossRef before giving up.

    Returns:
        dict: A dictionary with title, abstract, DOI, and date. When the title
        is missing, the request fails, or there is no hit, the dictionary
        holds placeholders ('' and 'No DOI available') instead of raising.
    """
    fallback = _empty_article_info(title)
    if not isinstance(title, str) or not title.strip():
        # A missing title (NaN) would otherwise be sent as the query "nan".
        return fallback

    params = {
        "query.title": title,
        "rows": 1
    }
    try:
        response = requests.get(CROSSREF_WORKS_URL, params=params, timeout=timeout)
        if response.status_code != 200:
            return fallback
        payload = response.json()
    except (requests.RequestException, ValueError):
        # One failed lookup must not abort the whole run.
        return fallback

    message = payload.get('message') if isinstance(payload, dict) else None
    items = message.get('items') if isinstance(message, dict) else None
    if not items:
        return fallback

    item = items[0]
    titles = item.get('title') or ['']
    published = item.get('published-print') or item.get('published-online') or {}
    date_parts = published.get('date-parts') or [[]]
    return {
        'Title': titles[0],
        'Abstract': item.get('abstract', ''),
        'DOI': item.get('DOI', NO_DOI),
        'Date': _format_date_parts(date_parts[0] or []),
    }


def _cached_article_info(title):
    """Return ``get_article_info(title)``, reusing earlier results for the same title."""
    if not isinstance(title, str):
        # Missing titles never reach the network, so there is nothing to cache.
        return get_article_info(title)
    if title not in _article_info_cache:
        _article_info_cache[title] = get_article_info(title)
    return _article_info_cache[title]


def _fill_missing_metadata(record, fetched):
    """Copy Title/DOI/Date from ``fetched`` into ``record`` where the record value is null.

    Values already present in the record, including the matched abstract,
    are left untouched.
    """
    for field in FETCHABLE_FIELDS:
        if pd.isnull(record[field]):
            record[field] = fetched[field]
    return record


# Load the dataset
data = pd.read_csv('107k_details_1.tsv', sep='\t')

# Iterate through each row in the dataset
for _, row in data.iterrows():
    gene = row['Gene']
    drug_field = row['Drug']
    if not isinstance(gene, str) or not isinstance(drug_field, str):
        # Gene or drug list is missing for this row: nothing to search for.
        continue

    abstract = row['Abs'] if isinstance(row['Abs'], str) else 'no'
    # Alternative gene spelling (first hyphen replaced with a slash), tried
    # when the original spelling is not found.
    modified_gene = gene.replace('-', '/', 1)

    for drug in (part.strip() for part in drug_field.split(', ')):
        if not drug:
            continue

        found = extract_mentions(abstract, gene, drug) or (
            modified_gene != gene
            and extract_mentions(abstract, modified_gene, drug)
        )
        if not found:
            continue

        record = {
            'PMID': row['PMID'],
            'Gene': gene,
            'Drug': drug,
            'Title': row['Title'],
            'Abstract': abstract,
            'DOI': row['Doi'],
            'Date': row['Date'],
        }
        # Fetch missing details from CrossRef if necessary
        if any(pd.isnull(record[field]) for field in FETCHABLE_FIELDS):
            _fill_missing_metadata(record, _cached_article_info(row['Title']))
        matched_records.append(record)

# Convert the matched records into a DataFrame and save to a TSV file.
# Explicit columns keep the header row even when nothing matched.
matched_df = pd.DataFrame(matched_records, columns=RECORD_COLUMNS)
matched_df.to_csv('107k_mentions_in_abstract.tsv', sep='\t', index=False)