Skip to content

Extract Drugs

This module extracts drugs from the records.

Overview

This module extracts drugs from the records.

It utilizes the drug columns (in order) listed in the config file (config.json).

It also requires the drug extraction tool (`extract-drugs`, which this module invokes through `uvx`) to be available on your machine.

command(input_fpath, target_columns)

Build the command to run the drug extraction tool.

Parameters:

Name Type Description Default
input_fpath str

path to the input file

required
target_columns list[str] | str

the column(s) to search

required

Returns:

Type Description
list[str]

list[str]: the command (list) to run

Source code in src/opendata_pipeline/extract_drugs.py
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
def command(input_fpath: str, target_columns: list[str] | str) -> list[str]:
    """Build the command to run the drug extraction tool.

    Args:
        input_fpath (str): path to the input file
        target_columns (list[str] | str): the column(s) to search

    Returns:
        list[str]: the command (list) to run
    """
    cmd = [
        "uvx",
        "extract-drugs",
        "search",
        "--data-file",
        input_fpath,
        "--id-col",
        # we made this column when we fetched the data
        "CaseIdentifier",
    ]
    if isinstance(target_columns, str):
        cmd.extend(["--search-cols", target_columns])
    else:
        for col in target_columns:
            cmd.extend(["--search-cols", col])
    return cmd

export_drug_output(drug_results)

Export the drug output to a file.

Source code in src/opendata_pipeline/extract_drugs.py
102
103
104
105
106
107
108
109
110
111
112
def export_drug_output(drug_results: list[dict[str, Any]]) -> None:
    """Write the drug results to disk as JSON Lines and as CSV.

    Both files are written inside the ``data`` directory:
    ``drug_data.jsonl`` gets one JSON object per record, and
    ``drug_output.csv`` gets the same records as a table with the
    ``row_id`` column renamed to ``CaseIdentifier``.
    """
    out_dir = Path("data")

    # JSON Lines: one serialized record per line
    with open(out_dir / "drug_data.jsonl", "w") as jsonl_file:
        jsonl_file.writelines(
            orjson.dumps(rec).decode("utf-8") + "\n" for rec in drug_results
        )

    # CSV: same records, with the id column renamed
    frame = pd.DataFrame(drug_results)
    frame = frame.rename(columns={"row_id": "CaseIdentifier"})
    frame.to_csv(out_dir / "drug_output.csv", index=False)

fetch_drug_search_terms()

Fetch drug search terms from the remote github repo.

Returns:

Type Description
None

Nothing is returned; the downloaded search terms are written to search_terms.csv in the current working directory.

Source code in src/opendata_pipeline/extract_drugs.py
23
24
25
26
27
28
29
30
31
32
33
def fetch_drug_search_terms() -> None:
    """Fetch drug search terms from the remote github repo.

    The downloaded CSV text is written to ``search_terms.csv`` in the
    current working directory; nothing is returned.

    Raises:
        requests.HTTPError: if the server responds with an error status
        requests.Timeout: if the server does not respond in time
    """
    console.log("Fetching drug search terms from GitHub")
    url = "https://raw.githubusercontent.com/UK-IPOP/drug-extraction/main/data/search_terms.csv"
    # without a timeout a stalled connection would hang the pipeline forever
    resp = requests.get(url, timeout=30)
    # fail loudly instead of saving an error page as the search terms file
    resp.raise_for_status()
    Path("search_terms.csv").write_text(resp.text)

read_drug_output(source)

Read the drug output file and yield each record.

Source code in src/opendata_pipeline/extract_drugs.py
64
65
66
67
68
69
70
def read_drug_output(source: str) -> Generator[dict[str, str], None, None]:
    """Read the drug output file and yield each record.

    The file read is ``output.csv`` in the current working directory.

    Args:
        source (str): value stored under the ``data_source`` key of every record

    Yields:
        dict[str, str]: one CSV row, with ``data_source`` added
    """
    # newline="" lets the csv module do its own newline handling, so line
    # breaks embedded in quoted fields are preserved as written
    with open(Path.cwd() / "output.csv", "r", newline="") as f:
        for row in csv.DictReader(f):
            row["data_source"] = source
            yield row

run(settings)

Run the drug extraction tool.

Source code in src/opendata_pipeline/extract_drugs.py
115
116
117
118
119
120
121
122
123
124
125
def run(settings: models.Settings) -> None:
    """Run the drug extraction step for every configured data source.

    Downloads the search terms, runs the drug tool once per entry in
    ``settings.sources``, and exports the combined results.
    """
    fetch_drug_search_terms()

    # flatten the per-source results into one list, keeping source order
    all_records: list[dict[str, Any]] = [
        record
        for source_config in settings.sources
        for record in run_drug_tool(config=source_config)
    ]

    console.log("Exporting drug data...")
    export_drug_output(drug_results=all_records)

run_drug_tool(config)

Run the drug extraction tool.

Parameters:

Name Type Description Default
config DataSource

the data source config

required

Returns:

Type Description
list[dict[str, Any]]

list[dict[str, Any]]: the drug results

Source code in src/opendata_pipeline/extract_drugs.py
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
def run_drug_tool(config: models.DataSource) -> list[dict[str, Any]]:
    """Run the drug extraction tool.

    Builds the command for the source's prepared file under ``data``,
    executes it, and then reads the records back from the tool's output file.

    Args:
        config (models.DataSource): the data source config

    Returns:
        list[dict[str, Any]]: the drug results

    Raises:
        subprocess.CalledProcessError: if the tool exits with a non-zero status
    """
    in_file = Path("data") / config.drug_prep_filename
    cmd = command(
        input_fpath=in_file.as_posix(),
        target_columns=config.drug_columns,
    )
    console.log(
        f"Running drug extraction tool on {config.drug_columns} for {config.name}"
    )
    # output is written to file; check=True stops us from reading a stale
    # output file left by an earlier source when the tool itself failed
    subprocess.run(cmd, check=True)
    # so now we read the file back, tagging each record with the source name
    return list(read_drug_output(config.name))