Extract Drugs

This module extracts drugs from the records.

Overview

This module extracts drugs from the records.

It utilizes the drug columns (in order) listed in the config file (config.json).

It also requires you to have the drug extraction tool installed.

`command(input_fpath, target_columns)`

Build the command to run the drug extraction tool.

Parameters:

Name	Type	Description	Default
`input_fpath`	`str`	path to the input file	required
`target_columns`	`list[str] \| str`	the column(s) to search	required

Returns:

Type	Description
`list[str]`	the command, as an argument list, to run

Source code in src/opendata_pipeline/extract_drugs.py

def command(input_fpath: str, target_columns: list[str] | str) -> list[str]:
    """Build the command to run the drug extraction tool.

    Args:
        input_fpath (str): path to the input file
        target_columns (list[str] | str): the column(s) to search

    Returns:
        list[str]: the command (list) to run
    """
    cmd = [
        "uvx",
        "extract-drugs",
        "search",
        "--data-file",
        input_fpath,
        "--id-col",
        # we made this column when we fetched the data
        "CaseIdentifier",
    ]
    if isinstance(target_columns, str):
        cmd.extend(["--search-cols", target_columns])
    else:
        for col in target_columns:
            cmd.extend(["--search-cols", col])
    return cmd

`export_drug_output(drug_results)`

Export the drug output to a file.

Source code in src/opendata_pipeline/extract_drugs.py

def export_drug_output(drug_results: list[dict[str, Any]]) -> None:
    """Export the drug output to a file.

    Every record is written as one JSON document per line to
    ``data/drug_data.jsonl``; the same records are also written as a table to
    ``data/drug_output.csv`` with the ``row_id`` column renamed to
    ``CaseIdentifier``.

    Args:
        drug_results (list[dict[str, Any]]): the drug records to export
    """
    # the serialized bytes are decoded as UTF-8 below, so the file is opened as
    # UTF-8 explicitly instead of relying on the platform's default encoding
    with open(Path("data") / "drug_data.jsonl", "w", encoding="utf-8") as f:
        for record in drug_results:
            f.write(orjson.dumps(record).decode("utf-8") + "\n")

    (
        pd.DataFrame(drug_results)
        .rename(columns={"row_id": "CaseIdentifier"})
        .to_csv(Path("data") / "drug_output.csv", index=False)
    )

`fetch_drug_search_terms()`

Fetch drug search terms from the remote github repo.

Returns:

Type	Description
`None`	Nothing is returned; the downloaded search terms are written to `search_terms.csv` in the current working directory

Source code in src/opendata_pipeline/extract_drugs.py

def fetch_drug_search_terms() -> None:
    """Fetch drug search terms from the remote github repo.

    The downloaded CSV text is written to ``search_terms.csv`` (relative to
    the current working directory); nothing is returned.

    Raises:
        requests.HTTPError: if the server answers with an error status
        requests.Timeout: if the server does not respond in time
    """
    console.log("Fetching drug search terms from GitHub")
    url = "https://raw.githubusercontent.com/UK-IPOP/drug-extraction/main/data/search_terms.csv"
    # without a timeout a stalled connection would block the pipeline forever
    resp = requests.get(url, timeout=30)
    # fail loudly rather than saving an HTTP error page as the search terms
    resp.raise_for_status()
    Path("search_terms.csv").write_text(resp.text, encoding="utf-8")

`read_drug_output(source)`

Read the drug output file and yield each record.

Source code in src/opendata_pipeline/extract_drugs.py

def read_drug_output(source: str) -> Generator[dict[str, str], None, None]:
    """Read the drug output file and yield each record.

    The file read is ``output.csv`` in the current working directory. Each
    yielded row gains a ``data_source`` key set to ``source``.

    Args:
        source (str): label stored under ``data_source`` on every record

    Yields:
        dict[str, str]: one CSV row, keyed by the header names
    """
    # newline="" is what the csv module requires so that line breaks inside
    # quoted fields reach the parser untouched
    # NOTE(review): assumes the tool writes UTF-8 -- confirm
    with open(Path.cwd() / "output.csv", "r", newline="", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for line in reader:
            line["data_source"] = source
            yield line

`run(settings)`

Run the drug extraction tool.

Source code in src/opendata_pipeline/extract_drugs.py

def run(settings: models.Settings) -> None:
    """Run the drug extraction step for every configured data source.

    Downloads the search terms first, then runs the tool once per entry in
    ``settings.sources`` and exports the combined results.
    """
    fetch_drug_search_terms()

    # flatten the per-source result lists into one list, in source order
    all_records: list[dict[str, Any]] = [
        record
        for source_config in settings.sources
        for record in run_drug_tool(config=source_config)
    ]

    console.log("Exporting drug data...")
    export_drug_output(drug_results=all_records)

`run_drug_tool(config)`

Run the drug extraction tool.

Parameters:

Name	Type	Description	Default
`config`	`DataSource`	the data source config	required

Returns:

Type	Description
`list[dict[str, Any]]`	the drug results

Source code in src/opendata_pipeline/extract_drugs.py

def run_drug_tool(config: models.DataSource) -> list[dict[str, Any]]:
    """Run the drug extraction tool.

    Args:
        config (models.DataSource): the data source config

    Returns:
        list[dict[str, Any]]: the drug results

    Raises:
        subprocess.CalledProcessError: if the tool exits with a non-zero status
    """
    in_file = Path("data") / config.drug_prep_filename
    cmd = command(
        input_fpath=in_file.as_posix(),
        target_columns=config.drug_columns,
    )
    console.log(
        f"Running drug extraction tool on {config.drug_columns} for {config.name}"
    )
    # output is written to file; check=True stops us from reading a stale
    # output file (left by an earlier source) when the tool itself failed
    subprocess.run(cmd, check=True)
    # so now we read the file back, tagging every record with this source's name
    return list(read_drug_output(config.name))