You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
89 lines
2.9 KiB
Rust
89 lines
2.9 KiB
Rust
use std::fs::File;
|
|
use std::io::{self, BufWriter, SeekFrom};
|
|
use std::io::prelude::*;
|
|
use std::path::PathBuf;
|
|
use crate::index::InMemoryIndex;
|
|
use crate::tmp::TmpDir;
|
|
use byteorder::{LittleEndian, WriteBytesExt};
|
|
|
|
/// Writer for saving an index to a binary file.
///
/// An index file is laid out as follows:
///
/// 1. The first 8 bytes contain the offset of the table of contents, in
///    bytes from the start of the file, as a little-endian `u64`.
/// 2. Then come the main entries, all stored back-to-back with no
///    particular metadata.
/// 3. Last comes the table of contents: one record per call to
///    `write_contents_entry`, in call order.
///
/// The header is written as a zero placeholder by `new` and patched with the
/// real offset by `finish`, once the size of the main part is known.
pub struct IndexFileWriter {
    /// The number of bytes written so far, header included. This is the file
    /// offset at which the next chunk passed to `write_main` will land.
    offset: u64,

    /// The open file we're writing to.
    writer: BufWriter<File>,

    /// The table of contents for this file, serialized in memory until
    /// `finish` appends it after the main entries.
    contents_buf: Vec<u8>,
}

impl IndexFileWriter {
    /// Start a new index file on `f`, reserving room for the header.
    ///
    /// # Errors
    ///
    /// Returns any I/O error raised while writing the placeholder header.
    pub fn new(mut f: BufWriter<File>) -> io::Result<IndexFileWriter> {
        const HEADER_SIZE: u64 = 8;

        // Placeholder for the table-of-contents offset; `finish` overwrites it.
        f.write_all(&[0u8; HEADER_SIZE as usize])?;

        Ok(IndexFileWriter {
            offset: HEADER_SIZE,
            writer: f,
            contents_buf: Vec::new(),
        })
    }

    /// Append `buf` to the main part of the file and advance `offset` by its
    /// length.
    ///
    /// # Errors
    ///
    /// Returns any I/O error raised by the underlying writer; `offset` is left
    /// unchanged in that case.
    pub fn write_main(&mut self, buf: &[u8]) -> io::Result<()> {
        self.writer.write_all(buf)?;
        self.offset += buf.len() as u64;
        Ok(())
    }

    /// Queue one table-of-contents record. Nothing reaches the file until
    /// `finish` is called.
    ///
    /// The record is serialized, all integers little-endian, as: `offset`
    /// (`u64`), `nbytes` (`u64`), `df` (`u32`), the byte length of `term`
    /// (`u32`), then the UTF-8 bytes of `term`. `offset` and `nbytes` are
    /// stored verbatim; they are meant to be the position and length of the
    /// term's data within the main part of the file.
    ///
    /// # Panics
    ///
    /// Panics if `term` is longer than `u32::MAX` bytes, because its length
    /// could not be represented in the record and the file would be corrupt.
    pub fn write_contents_entry(&mut self, term: String, df: u32, offset: u64, nbytes: u64) {
        let term_bytes = term.as_bytes();
        assert!(
            term_bytes.len() <= u32::MAX as usize,
            "term is {} bytes long, which does not fit the u32 length field",
            term_bytes.len()
        );
        let term_len = term_bytes.len() as u32;

        // Writing into a Vec cannot fail, so build the record without any
        // fallible I/O calls.
        self.contents_buf.reserve(8 + 8 + 4 + 4 + term_bytes.len());
        self.contents_buf.extend_from_slice(&offset.to_le_bytes());
        self.contents_buf.extend_from_slice(&nbytes.to_le_bytes());
        self.contents_buf.extend_from_slice(&df.to_le_bytes());
        self.contents_buf.extend_from_slice(&term_len.to_le_bytes());
        self.contents_buf.extend_from_slice(term_bytes);
    }

    /// Finish writing the index file and close it.
    ///
    /// Appends the table of contents after the main entries, then seeks back
    /// to the start of the file and stores the table's offset in the header.
    ///
    /// # Errors
    ///
    /// Returns any I/O error raised while writing, seeking or flushing.
    pub fn finish(mut self) -> io::Result<()> {
        let contents_start = self.offset;
        self.writer.write_all(&self.contents_buf)?;
        println!("{} bytes main, {} bytes total", contents_start, contents_start + self.contents_buf.len() as u64);

        self.writer.seek(SeekFrom::Start(0))?;
        self.writer.write_all(&contents_start.to_le_bytes())?;

        // The header is still sitting in the BufWriter. Flush explicitly:
        // the flush performed on drop discards any error, which would let a
        // failed header write go unnoticed.
        self.writer.flush()
    }
}
|
|
|
|
pub fn write_index_to_tmp_file(index: InMemoryIndex, tmp_dir: &mut TmpDir) -> io::Result<PathBuf> {
|
|
let (filename, f) = tmp_dir.create()?;
|
|
let mut writer = IndexFileWriter::new(f)?;
|
|
|
|
// The merge algorithm requires the entries within each file to be sorted by term.
|
|
// Sort before writing anything.
|
|
let mut index_as_vec: Vec<_> = index.map.into_iter().collect();
|
|
index_as_vec.sort_by(|&(ref a, _), &(ref b, _)| a.cmp(b));
|
|
|
|
for (term, hits) in index_as_vec {
|
|
let df = hits.len() as u32;
|
|
let start = writer.offset;
|
|
for buffer in hits {
|
|
writer.write_main(&buffer)?;
|
|
}
|
|
let stop = writer.offset;
|
|
writer.write_contents_entry(term, df, start, stop - start);
|
|
}
|
|
|
|
writer.finish()?;
|
|
println!("wrote file {:?}", filename);
|
|
Ok(filename)
|
|
}
|