chore: make code more idiomatic

Signed-off-by: simonsan <14062932+simonsan@users.noreply.github.com>
pull/5/head
simonsan 3 months ago
parent e538dfe98b
commit 12031aa1ee
No known key found for this signature in database
GPG Key ID: E11D13668EC3B71B

@ -32,8 +32,8 @@ fn main() {
"filenames",
Collect,
"Names of files/directories to index. \
For directories, all .txt files immediately \
under the directory are indexed.",
For directories, all .txt files immediately \
under the directory are indexed.",
);
ap.parse_args_or_exit();
}

@ -21,6 +21,7 @@ fn tokenize(text: &str) -> Vec<&str> {
/// answer simple search queries. And you can use the `read`, `write`, and
/// `merge` modules to save an in-memory index to disk and merge it with other
/// indices, producing a large index.
#[derive(Default, Debug, Clone, PartialEq, Eq)]
pub struct InMemoryIndex {
/// The total number of words in the indexed documents.
pub word_count: usize,
@ -48,10 +49,7 @@ pub type Hit = Vec<u8>;
impl InMemoryIndex {
/// Create a new, empty index.
pub fn new() -> Self {
Self {
word_count: 0,
map: HashMap::new(),
}
Self::default()
}
/// Index a single document.

@ -1,22 +1,39 @@
use std::fs::{self, File};
use std::io::{self, BufWriter};
use std::mem;
use std::path::{Path, PathBuf};
use std::{fmt, mem};
use std::{
fmt::Debug,
fs::{self, File},
};
use std::{
fmt::Formatter,
io::{self, BufWriter},
};
use crate::read::IndexFileReader;
use crate::tmp::TmpDir;
use crate::write::IndexFileWriter;
/// Tuning constants for the file-merge phase of index building.
pub(crate) mod constants {
// How many files to merge at a time, at most.
pub const NSTREAMS: usize = 8;
// Name of the final merged index file placed in the output directory.
pub const MERGED_FILENAME: &str = "index.dat";
}
/// Merges many on-disk index files into a single output file, combining
/// at most `constants::NSTREAMS` files per merge pass.
#[derive(Clone)]
pub struct FileMerge {
/// Directory that receives the final merged file
/// (named `constants::MERGED_FILENAME`).
output_dir: PathBuf,
/// Source of uniquely named temporary files for intermediate merge results.
tmp_dir: TmpDir,
/// `stacks[level]` holds files waiting to be merged at that level; once a
/// level accumulates `NSTREAMS` files they are merged into one file that
/// is pushed to the next level.
stacks: Vec<Vec<PathBuf>>,
}
// How many files to merge at a time, at most.
const NSTREAMS: usize = 8;
const MERGED_FILENAME: &str = "index.dat";
// Manual `Debug` impl that reports only `output_dir` and `stacks`.
// NOTE(review): `tmp_dir` is omitted from the output — presumably because
// `TmpDir` did not implement `Debug` when this was written; this commit
// appears to derive `Debug` on `TmpDir`, so confirm whether a plain
// `#[derive(Debug)]` on `FileMerge` could replace this impl.
impl Debug for FileMerge {
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
f.debug_struct("FileMerge")
.field("output_dir", &self.output_dir)
.field("stacks", &self.stacks)
.finish()
}
}
impl FileMerge {
pub fn new(output_dir: &Path) -> Self {
@ -34,7 +51,7 @@ impl FileMerge {
self.stacks.push(vec![]);
}
self.stacks[level].push(file);
if self.stacks[level].len() < NSTREAMS {
if self.stacks[level].len() < constants::NSTREAMS {
break;
}
let (filename, out) = self.tmp_dir.create()?;
@ -48,11 +65,11 @@ impl FileMerge {
}
pub fn finish(mut self) -> io::Result<()> {
let mut tmp = Vec::with_capacity(NSTREAMS);
let mut tmp = Vec::with_capacity(constants::NSTREAMS);
for stack in self.stacks {
for file in stack.into_iter().rev() {
tmp.push(file);
if tmp.len() == NSTREAMS {
if tmp.len() == constants::NSTREAMS {
merge_reversed(&mut tmp, &mut self.tmp_dir)?;
}
}
@ -63,7 +80,9 @@ impl FileMerge {
}
assert!(tmp.len() <= 1);
match tmp.pop() {
Some(last_file) => fs::rename(last_file, self.output_dir.join(MERGED_FILENAME)),
Some(last_file) => {
fs::rename(last_file, self.output_dir.join(constants::MERGED_FILENAME))
}
None => Err(io::Error::new(
io::ErrorKind::Other,
"no documents were parsed or none contained any words",
@ -122,7 +141,7 @@ fn merge_streams(files: Vec<PathBuf>, out: BufWriter<File>) -> io::Result<()> {
fn merge_reversed(filenames: &mut Vec<PathBuf>, tmp_dir: &mut TmpDir) -> io::Result<()> {
filenames.reverse();
let (merged_filename, out) = tmp_dir.create()?;
let mut to_merge = Vec::with_capacity(NSTREAMS);
let mut to_merge = Vec::with_capacity(constants::NSTREAMS);
mem::swap(filenames, &mut to_merge);
merge_streams(to_merge, out)?;
filenames.push(merged_filename);

@ -38,6 +38,7 @@ pub struct IndexFileReader {
/// Each entry in the table of contents is small. It consists of a string, the
/// `term`; summary information about that term, as used in the corpus (`df`);
/// and a pointer to bulkier data that tells more (`offset` and `nbytes`).
#[derive(Default, Debug, Clone, PartialEq, Eq, Hash)]
pub struct Entry {
/// The term is a word that appears in one or more documents in the corpus.
/// The index file contains information about the documents that use this

@ -1,39 +1,41 @@
use std::io::{self, BufWriter};
use std::fs::{self, File};
use std::io::{self, BufWriter};
use std::path::{Path, PathBuf};
#[derive(Clone)]
#[derive(Default, Debug, Clone, PartialEq, Eq)]
pub struct TmpDir {
dir: PathBuf,
n: usize
n: usize,
}
impl TmpDir {
pub fn new<P: AsRef<Path>>(dir: P) -> Self {
Self {
dir: dir.as_ref().to_owned(),
n: 1
n: 1,
}
}
pub fn create(&mut self) -> io::Result<(PathBuf, BufWriter<File>)> {
let mut r#try = 1;
loop {
let filename = self.dir.join(PathBuf::from(format!("tmp{:08x}.dat", self.n)));
let filename = self
.dir
.join(PathBuf::from(format!("tmp{:08x}.dat", self.n)));
self.n += 1;
match fs::OpenOptions::new()
.write(true)
.create_new(true)
.open(&filename)
.write(true)
.create_new(true)
.open(&filename)
{
Ok(f) =>
return Ok((filename, BufWriter::new(f))),
Err(exc) =>
Ok(f) => return Ok((filename, BufWriter::new(f))),
Err(exc) => {
if r#try < 999 && exc.kind() == io::ErrorKind::AlreadyExists {
// keep going
} else {
return Err(exc);
}
}
}
r#try += 1;
}

@ -15,6 +15,7 @@ use std::path::PathBuf;
/// An index file has two parts. The main part of the file is a sequence of
/// entries, stored back-to-back; the
#[derive(Debug)]
pub struct IndexFileWriter {
/// The number of bytes written so far.
offset: u64,

Loading…
Cancel
Save