import pandas as pd
import re
import requests

# Module-level accumulator: the row loop further down appends one dict per
# matched (row, drug) pair, and the list is turned into a DataFrame at the end.
matched_records = []

def extract_mentions(text, gene, drug):
    """
    Check if both gene and drug are mentioned in the text.

    Matching is case-insensitive and token-based: a keyword only counts when
    it is not directly preceded or followed by a word character, so "BRCA1"
    does not match inside "BRCA12". Surrounding whitespace on the keywords is
    ignored.

    Args:
        text (str): The abstract to search within.
        gene (str): The gene keyword to search for.
        drug (str): The drug keyword to search for.

    Returns:
        str: 'yes' if both keywords are found, otherwise None. None is also
        returned when any argument is not a string (e.g. a NaN from a missing
        cell) or when a keyword is empty.
    """
    # A missing value cannot be searched or searched for.
    if not all(isinstance(value, str) for value in (text, gene, drug)):
        return None

    for keyword in (gene, drug):
        keyword = keyword.strip()
        # An empty keyword would produce a pattern that matches almost any
        # text, so treat it as "not mentioned".
        if not keyword:
            return None
        # Lookarounds instead of \b: \b needs a word character on one side,
        # so it fails for keywords that start or end with punctuation such
        # as "interferon (alpha)".
        pattern = r'(?<!\w){}(?!\w)'.format(re.escape(keyword))
        if not re.search(pattern, text, flags=re.IGNORECASE):
            return None

    return "yes"

def _format_date_parts(date_parts):
    """Render a CrossRef ``date-parts`` value ([[year, month, day]]) as 'YYYY[-MM[-DD]]'."""
    if not date_parts or not date_parts[0]:
        return ''
    # Keep the leading components up to the first missing one, so a partial
    # date such as [[2020]] or [[2020, 5]] still yields a usable string.
    components = []
    for part in date_parts[0][:3]:
        if part is None:
            break
        components.append(part)
    if not components:
        return ''
    year, *rest = components
    return '-'.join([str(year)] + [f"{int(value):02d}" for value in rest])


def get_article_info(title, timeout=10):
    """
    Retrieve article information from CrossRef API if fields are missing.

    The first hit of a ``query.title`` search is used. Any failure (unusable
    title, network error, non-200 status, malformed or empty response) yields
    the fallback dictionary instead of raising.

    Args:
        title (str): The title of the article.
        timeout (float): Seconds to wait for CrossRef before giving up.

    Returns:
        dict: A dictionary with the keys 'Title', 'Abstract', 'DOI' and
        'Date'. On failure 'Title' is the value passed in, 'Abstract' and
        'Date' are empty strings and 'DOI' is 'No DOI available'. 'Date' is
        formatted as 'YYYY', 'YYYY-MM' or 'YYYY-MM-DD' depending on how many
        components CrossRef provides.
    """
    fallback = {'Title': title, 'Abstract': '', 'DOI': 'No DOI available', 'Date': ''}

    # A missing (NaN/None) or blank title gives CrossRef nothing to search for.
    if not isinstance(title, str) or not title.strip():
        return fallback

    url = "https://api.crossref.org/works"
    params = {
        "query.title": title,
        "rows": 1,
    }
    try:
        # Without a timeout a stalled connection would block the caller forever.
        response = requests.get(url, params=params, timeout=timeout)
        if response.status_code != 200:
            return fallback
        payload = response.json()
    except (requests.RequestException, ValueError):
        # Network failure or a body that is not valid JSON.
        return fallback

    message = payload.get('message') if isinstance(payload, dict) else None
    items = message.get('items') if isinstance(message, dict) else None
    if not items:
        return fallback

    item = items[0]
    # 'title' is a list in CrossRef records and may be absent or empty; fall
    # back to the queried title rather than indexing into an empty list.
    titles = item.get('title') or []
    published = item.get('published-print') or item.get('published-online') or {}
    return {
        'Title': titles[0] if titles else title,
        'Abstract': item.get('abstract', ''),
        'DOI': item.get('DOI', 'No DOI available'),
        'Date': _format_date_parts(published.get('date-parts')),
    }

# Load the dataset; the loop below reads the PMID, Gene, Drug, Title, Abs,
# Doi and Date columns.
data = pd.read_csv('107k_details_1.tsv', sep='\t')

# Fixed column order for the output; also guarantees a header row when
# nothing matches.
output_columns = ['PMID', 'Gene', 'Drug', 'Title', 'Abstract', 'DOI', 'Date']

# Iterate through each row in the dataset
for index, row in data.iterrows():
    gene = row['Gene']
    drug_field = row['Drug']
    abstract = row['Abs']

    # Missing cells are read as NaN (a float). Without a gene, a drug list and
    # an abstract there is nothing to match, so skip the row instead of
    # crashing on .split()/.replace().
    if not (isinstance(gene, str) and isinstance(drug_field, str) and isinstance(abstract, str)):
        continue
    if not gene.strip():
        continue

    # Also try the gene with its first hyphen written as a slash; skip the
    # second search when the gene has no hyphen.
    gene_variants = [gene]
    modified_gene = gene.replace('-', '/', 1)
    if modified_gene != gene:
        gene_variants.append(modified_gene)

    # CrossRef is queried at most once per row, and only if a match needs it.
    fetched_info = None

    for drug in (name.strip() for name in drug_field.split(', ')):
        if not drug:
            continue
        if not any(extract_mentions(abstract, variant, drug) for variant in gene_variants):
            continue

        record = {
            'PMID': row['PMID'],
            'Gene': gene,
            'Drug': drug,
            'Title': row['Title'],
            'Abstract': abstract,
            'DOI': row['Doi'],
            'Date': row['Date'],
        }

        # Fetch missing details from CrossRef if necessary. The lookup is keyed
        # on the title, so it is only possible when the title is present.
        # Only the missing fields are filled: the matched abstract and any
        # values already in the dataset are never overwritten.
        missing_fields = [field for field in ('DOI', 'Date') if pd.isnull(record[field])]
        if missing_fields and isinstance(row['Title'], str):
            if fetched_info is None:
                fetched_info = get_article_info(row['Title'])
            for field in missing_fields:
                record[field] = fetched_info[field]

        matched_records.append(record)

# Convert the matched records into a DataFrame and save to a TSV file
matched_df = pd.DataFrame(matched_records, columns=output_columns)
matched_df.to_csv('107k_mentions_in_abstract.tsv', sep='\t', index=False)