Close Menu
    Facebook X (Twitter) Instagram
    Articles Stock
    • Home
    • Technology
    • AI
    • Pages
      • About ArticlesStock — AI & Technology Journalist
      • Contact us
      • Disclaimer For Articles Stock
      • Privacy Policy
      • Terms and Conditions
    Facebook X (Twitter) Instagram
    Articles Stock
    AI

    A Coding Implementation on Microsoft’s Phi-4-Mini for Quantized Inference, Reasoning, Tool Use, RAG, and LoRA Fine-Tuning

    Naveed Ahmad | By Naveed Ahmad | 21/04/2026 | Updated: 21/04/2026 | No Comments | 2 Mins Read
    blog 58


    import subprocess, sys, os, shutil, glob
    
    
    def pip_install(args):
        """Install packages with pip into the running interpreter's environment.

        Runs ``<sys.executable> -m pip install -q <args...>`` as a subprocess.

        Args:
            args: Iterable of pip arguments (requirement specifiers and/or
                flags such as ``"-U"``), appended after ``install -q``.

        Raises:
            subprocess.CalledProcessError: If pip exits with a non-zero status.
        """
        # check=True turns a failed install into an exception instead of
        # letting the script carry on with missing or mismatched packages.
        subprocess.run(
            [sys.executable, "-m", "pip", "install", "-q", *args],
            check=True,
        )
    
    
    # huggingface_hub is installed on its own, in a separate pip invocation
    # that runs before the rest of the stack is upgraded.
    _HUB_REQUIREMENT = ["huggingface_hub>=0.26,<1.0"]

    # Version-bounded requirements for the remaining libraries; installed
    # together with "-U" so existing copies are upgraded into range.
    _STACK_REQUIREMENTS = [
        "transformers>=4.49,<4.57",
        "accelerate>=0.33.0",
        "bitsandbytes>=0.43.0",
        "peft>=0.11.0",
        "datasets>=2.20.0,<3.0",
        "sentence-transformers>=3.0.0,<4.0",
        "faiss-cpu",
    ]

    pip_install(_HUB_REQUIREMENT)
    pip_install(["-U", *_STACK_REQUIREMENTS])
    
    
    # Delete every directory matching the Phi-4 pattern under the Hugging Face
    # "transformers_modules" cache. Removal errors are ignored, so a missing
    # or partially-removed cache never stops the script.
    # NOTE(review): presumably this clears stale dynamically-downloaded model
    # code from earlier runs — confirm.
    _phi4_cache_pattern = os.path.expanduser(
        "~/.cache/huggingface/modules/transformers_modules/microsoft/Phi-4*")
    for _cached_dir in glob.glob(_phi4_cache_pattern):
        shutil.rmtree(_cached_dir, ignore_errors=True)
    
    
    # Forget any already-imported copies of the libraries installed above, so
    # the import statements that follow load them again from disk rather than
    # reusing the cached module objects.
    _STALE_MODULE_PREFIXES = (
        "transformers", "huggingface_hub", "tokenizers",
        "accelerate", "peft", "datasets",
        "sentence_transformers",
    )
    # list(...) snapshots the keys: entries are deleted from sys.modules while
    # looping, which is not allowed when iterating the dict directly.
    for _m in list(sys.modules):
        if _m.startswith(_STALE_MODULE_PREFIXES):
            del sys.modules[_m]
    
    
    # These imports deliberately come after the pip installs and the
    # sys.modules purge earlier in the script, so they must not be hoisted
    # to the top of the file.
    import json
    import re
    import textwrap
    import warnings

    import torch

    # Silence all warnings for the rest of the session.
    warnings.filterwarnings("ignore")

    # `Trainer` is the transformers training class that pairs with
    # `TrainingArguments`; the library exports no class named `Coach`.
    from transformers import (
        AutoModelForCausalLM,
        AutoTokenizer,
        BitsAndBytesConfig,
        TextStreamer,
        TrainingArguments,
        Trainer,
        DataCollatorForLanguageModeling,
    )
    import transformers
    print(f"Utilizing transformers {transformers.__version__}")
    
    
    # Hugging Face Hub id of the checkpoint used throughout the script.
    PHI_MODEL_ID = "microsoft/Phi-4-mini-instruct"

    # Fail fast with an actionable message when no CUDA device is visible;
    # the 4-bit loading below is set up to run on a GPU.
    assert torch.cuda.is_available(), (
        "No GPU detected. In Colab: Runtime > Change runtime type > T4 GPU."
    )
    print(f"GPU detected: {torch.cuda.get_device_name(0)}")
    print(f"Loading Phi model (native phi3 arch, no remote code): {PHI_MODEL_ID}\n")
    
    
    # Tokenizer: if the checkpoint defines no padding token, reuse the
    # end-of-sequence token so a pad id is always available.
    phi_tokenizer = AutoTokenizer.from_pretrained(PHI_MODEL_ID)
    if phi_tokenizer.pad_token_id is None:
        phi_tokenizer.pad_token = phi_tokenizer.eos_token

    # 4-bit quantization settings: NF4 quantization type, double
    # quantization enabled, bfloat16 as the compute dtype.
    _quant_settings = {
        "load_in_4bit": True,
        "bnb_4bit_quant_type": "nf4",
        "bnb_4bit_compute_dtype": torch.bfloat16,
        "bnb_4bit_use_double_quant": True,
    }
    bnb_cfg = BitsAndBytesConfig(**_quant_settings)

    # Load the model with the quantization config, letting device_map="auto"
    # choose the device placement.
    phi_model = AutoModelForCausalLM.from_pretrained(
        PHI_MODEL_ID,
        device_map="auto",
        torch_dtype=torch.bfloat16,
        quantization_config=bnb_cfg,
    )
    phi_model.config.use_cache = True
    
    
    # Summary of the loaded model: allocated CUDA memory (in GB), the config's
    # model_type, the concrete model class, and a parameter count.
    print(f"\n✓ Phi-4-mini loaded in 4-bit. "
          f"GPU memory: {torch.cuda.memory_allocated()/1e9:.2f} GB")
    print(f"  Structure: {phi_model.config.model_type}   "
          f"(utilizing built-in {type(phi_model).__name__})")
    # NOTE(review): this sums numel() over the stored parameter tensors; with
    # 4-bit packed weights that may undercount the logical parameter count —
    # confirm before quoting the figure.
    print(f"  Parameters: ~{sum(p.numel() for p in phi_model.parameters())/1e9:.2f}B")
    
    
    def ask_phi(messages, *, instruments=None, max_new_tokens=512,
                temperature=0.3, stream=False, tools=None):
        """Single entry point for all Phi-4-mini inference calls below.

        Renders ``messages`` with the tokenizer's chat template, generates a
        continuation with ``phi_model`` and returns only the newly generated
        text.

        Args:
            messages: Chat messages in the form accepted by
                ``phi_tokenizer.apply_chat_template``.
            instruments: Tool definitions to expose to the model. Kept for
                backward compatibility; forwarded as the chat template's
                ``tools`` argument.
            max_new_tokens: Upper bound on the number of generated tokens.
            temperature: Sampling temperature. Values ``<= 0`` select greedy
                decoding; positive values enable sampling with ``top_p=0.9``.
            stream: If true, print tokens to stdout as they are generated.
            tools: Preferred alias of ``instruments``; takes precedence when
                both are given.

        Returns:
            The decoded completion (prompt tokens and special tokens
            removed), stripped of surrounding whitespace.
        """
        # The chat template reads tool schemas from the `tools` keyword; any
        # other keyword name would not be recognised as tool definitions.
        tool_specs = tools if tools is not None else instruments

        prompt_ids = phi_tokenizer.apply_chat_template(
            messages,
            tools=tool_specs,
            add_generation_prompt=True,
            return_tensors="pt",
        ).to(phi_model.device)

        streamer = (TextStreamer(phi_tokenizer, skip_prompt=True,
                                 skip_special_tokens=True)
                    if stream else None)

        gen_kwargs = {
            "max_new_tokens": max_new_tokens,
            "pad_token_id": phi_tokenizer.pad_token_id,
            "eos_token_id": phi_tokenizer.eos_token_id,
            "streamer": streamer,
        }
        if temperature > 0:
            # Clamp tiny positive temperatures to keep the value well away
            # from zero.
            gen_kwargs.update(
                do_sample=True,
                temperature=max(temperature, 1e-5),
                top_p=0.9,
            )
        else:
            # Greedy decoding: sampling-only knobs are left out on purpose.
            gen_kwargs["do_sample"] = False

        with torch.inference_mode():
            out = phi_model.generate(prompt_ids, **gen_kwargs)

        # Slice off the prompt so only the newly generated tokens are decoded.
        new_tokens = out[0][prompt_ids.shape[1]:]
        return phi_tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
    
    
    def banner(title):
        """Print a section heading framed by two 78-character ``=`` rules.

        Output is a blank line, a rule, the title indented by two spaces, and
        a closing rule.

        Args:
            title: Heading text to display.
        """
        rule = "=" * 78
        print(f"\n{rule}\n  {title}\n{rule}")



    Source link

    Naveed Ahmad

    Naveed Ahmad is a technology journalist and AI writer at ArticlesStock, covering artificial intelligence, machine learning, and emerging tech policy. Read his latest articles.

    Related Posts

    OpenAI says hackers stole some information after newest code safety concern

    14/05/2026

    Cerebras raises $5.5B, kicking off 2026’s IPO season with a bang

    14/05/2026

    Khosla Ventures is betting $10M on Ian Crosby, whose previous startup, Bench, imploded

    14/05/2026
    Leave A Reply Cancel Reply

    Categories
    • AI
    Recent Comments
      Facebook X (Twitter) Instagram Pinterest
      © 2026 ThemeSphere. Designed by ThemeSphere.

      Type above and press Enter to search. Press Esc to cancel.